[PATCH 2.5.53] NUMA scheduler (1/3)

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Erich Focht <efocht@ess.nec.de>
To: "Martin J. Bligh" <mbligh@aracnet.com>,
	Michael Hohnbaum <hohnbaum@us.ibm.com>
Cc: Robert Love <rml@tech9.net>, Ingo Molnar <mingo@elte.hu>,
	Stephen Hemminger <shemminger@osdl.org>,
	linux-kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH 2.5.53] NUMA scheduler (1/3)
Date: Tue, 31 Dec 2002 14:29:04 +0100	[thread overview]
Message-ID: <200212311429.04382.efocht@ess.nec.de> (raw)
In-Reply-To: <200212181721.39434.efocht@ess.nec.de>

[-- Attachment #1: Type: text/plain, Size: 801 bytes --]

Here comes the minimal NUMA scheduler built on top of the O(1) load
balancer rediffed for 2.5.53 with some changes in the core part. As
suggested by Michael, I added the cputimes_stat patch, as it is
absolutely needed for understanding the scheduler behavior.

The three patches:
01-numa-sched-core-2.5.53-24.patch: core NUMA scheduler infrastructure
  providing a node aware load_balancer. Cosmetic changes + more comments.

02-numa-sched-ilb-2.5.53-21.patch: initial load balancing, selects
  least loaded node & CPU at exec().

03-cputimes_stat-2.5.53.patch: adds back to the kernel per CPU user
  and system time statistics for each process in /proc/PID/cpu. Needed
  for evaluating scheduler behavior and performance of tasks running
  on SMP and NUMA systems.

Regards,
Erich

[-- Attachment #2: 01-numa-sched-core-2.5.53-24.patch --]
[-- Type: text/x-diff, Size: 12460 bytes --]

diff -urN a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	2002-12-24 06:20:16.000000000 +0100
+++ b/arch/i386/kernel/smpboot.c	2002-12-31 13:02:47.000000000 +0100
@@ -1191,6 +1191,7 @@
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	zap_low_mappings();
+	build_node_data();
 }
 
 void __init smp_intr_init()
diff -urN a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
--- a/arch/ia64/kernel/smpboot.c	2002-12-24 06:19:49.000000000 +0100
+++ b/arch/ia64/kernel/smpboot.c	2002-12-31 13:03:02.000000000 +0100
@@ -397,7 +397,7 @@
 static void
 smp_tune_scheduling (void)
 {
-	cache_decay_ticks = 10;	/* XXX base this on PAL info and cache-bandwidth estimate */
+	cache_decay_ticks = 8;	/* XXX base this on PAL info and cache-bandwidth estimate */
 
 	printk("task migration cache decay timeout: %ld msecs.\n",
 	       (cache_decay_ticks + 1) * 1000 / HZ);
@@ -522,6 +522,7 @@
 
 	printk(KERN_INFO"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 	       num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
+	build_node_data();
 }
 
 int __devinit
diff -urN a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c
--- a/arch/ppc64/kernel/smp.c	2002-12-24 06:21:01.000000000 +0100
+++ b/arch/ppc64/kernel/smp.c	2002-12-31 13:03:21.000000000 +0100
@@ -679,4 +679,5 @@
 
 	/* XXX fix this, xics currently relies on it - Anton */
 	smp_threads_ready = 1;
+	build_node_data();
 }
diff -urN a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	2002-12-24 06:19:35.000000000 +0100
+++ b/include/linux/sched.h	2002-12-31 13:02:18.000000000 +0100
@@ -445,6 +445,12 @@
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+#ifdef CONFIG_NUMA
+extern void build_node_data(void);
+#else
+#define build_node_data() {}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urN a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	2002-12-24 06:21:05.000000000 +0100
+++ b/kernel/sched.c	2002-12-31 13:46:03.000000000 +0100
@@ -158,6 +158,10 @@
 	struct list_head migration_queue;
 
 	atomic_t nr_iowait;
+#ifdef CONFIG_NUMA
+	unsigned long wait_time;
+	int wait_node;
+#endif
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
@@ -178,6 +182,64 @@
 #endif
 
 /*
+ * Node loads are scaled with LOADSCALE. This way:
+ * - we avoid zeros in integer divisions (dividing by node CPU number),
+ * - loads of nodes with different numbers of CPUs are comparable.
+ */
+#define LOADSCALE 128
+
+#ifdef CONFIG_NUMA
+/* Number of CPUs per node: sane values until all CPUs are up */
+static int _node_nr_cpus[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = NR_CPUS };
+static int node_ptr[MAX_NUMNODES+1]; /* first cpu of node (logical cpus are sorted!)*/
+#define node_nr_cpus(node)  _node_nr_cpus[node]
+
+/*
+ * Delay stealing from remote node when own queue is idle/busy. These delays
+ * tend to equalize the node loads. NODE_DELAY_IDLE is nonzero because we
+ * want to give idle CPUs in the busiest node a chance to steal first.
+ */
+#define NODE_DELAY_IDLE  (1*HZ/1000)
+#define NODE_DELAY_BUSY  (20*HZ/1000)
+
+/* the following macro implies that logical CPUs are sorted by node number */
+#define loop_over_node(cpu,node) \
+	for(cpu = node_ptr[node]; cpu < node_ptr[node+1]; cpu++)
+
+/*
+ * Build node_ptr and node_nr_cpus data after all CPUs have come up. This
+ * expects that the logical CPUs are sorted by their node numbers! Check
+ * out the NUMA API for details on why this should be that way.     [EF]
+ */
+void build_node_data(void)
+{
+	int n, cpu, ptr;
+	unsigned long mask;
+
+	ptr=0;
+	for (n = 0; n < numnodes; n++) {
+		mask = __node_to_cpu_mask(n) & cpu_online_map;
+		node_ptr[n] = ptr;
+		for (cpu = 0; cpu < NR_CPUS; cpu++)
+			if (mask  & (1UL << cpu))
+				ptr++;
+		node_nr_cpus(n) = ptr - node_ptr[n];
+	}
+	node_ptr[numnodes] = ptr;
+	printk("CPU nodes : %d\n",numnodes);
+	for (n = 0; n < numnodes; n++)
+		printk("node %d : %d .. %d\n",n,node_ptr[n],node_ptr[n+1]-1);
+}
+
+#else
+#define node_nr_cpus(node)  num_online_cpus()
+#define NODE_DELAY_IDLE 0
+#define NODE_DELAY_BUSY 0
+#define loop_over_node(cpu,n) for(cpu = 0; cpu < NR_CPUS; cpu++)
+#endif
+
+
+/*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
  * explicitly disabling preemption.
@@ -652,81 +714,134 @@
 }
 
 /*
- * find_busiest_queue - find the busiest runqueue.
+ * Find a runqueue from which to steal a task. We try to do this as locally as
+ * possible because we don't want to let tasks get far from their node.
+ * 
+ * 1. First try to find a runqueue within the own node with
+ * imbalance larger than 25% (relative to the current runqueue).
+ * 2. If the local node is well balanced, locate the most loaded node and its
+ * most loaded CPU.
+ *
+ * This routine implements node balancing by delaying steals from remote
+ * nodes more if the own node is (within margins) averagely loaded. The
+ * most loaded node is remembered as well as the time (jiffies). In the
+ * following calls to the load_balancer the time is compared with
+ * NODE_DELAY_BUSY (if load is around the average) or NODE_DELAY_IDLE (if own
+ * node is unloaded) if the most loaded node didn't change. This gives less 
+ * loaded nodes the chance to approach the average load but doesn't exclude
+ * busy nodes from stealing (just in case the cpus_allowed mask isn't good
+ * for the idle nodes).
+ * This concept can be extended easilly to more than two levels (multi-level
+ * scheduler), e.g.: CPU -> node -> supernode... by implementing node-distance
+ * dependent steal delays.
+ *
+ *                                                         <efocht@ess.nec.de>
+ */
+
+struct node_queue_data {
+	int total_load;
+	int average_load;
+	int busiest_cpu;
+	int busiest_cpu_load;
+};
+
+/*
+ * Check if CPUs are balanced. The check is more involved than the O(1) original
+ * because that one is simply wrong in certain cases (integer arithmetic !!!) EF
+ */
+#define cpus_balanced(m,t) (((m)<=1) || (((m)-(t))/2 < (((m)+(t))/2 + 3)/4))
+/*
+ * Check if nodes are imbalanced. "this" node is balanced (compared to the "comp"
+ * node) when it's load is not smaller than "comp"'s load - LOADSCALE/2.
  */
-static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
+#define nodes_balanced(comp,this) (((comp)-(this)) < LOADSCALE/2)
+
+static inline runqueue_t *find_busiest_queue(int this_cpu, int idle, int *nr_running)
 {
-	int nr_running, load, max_load, i;
-	runqueue_t *busiest, *rq_src;
+	runqueue_t *busiest_rq = NULL, *this_rq = cpu_rq(this_cpu), *src_rq;
+	int busiest_cpu, busiest_node=0, cpu, load, max_avg_load, avg_load;
+	int n, steal_delay, system_load = 0, this_node=__cpu_to_node(this_cpu); 
+	struct node_queue_data nd[MAX_NUMNODES], *node;
 
-	/*
-	 * We search all runqueues to find the most busy one.
-	 * We do this lockless to reduce cache-bouncing overhead,
-	 * we re-check the 'best' source CPU later on again, with
-	 * the lock held.
-	 *
-	 * We fend off statistical fluctuations in runqueue lengths by
-	 * saving the runqueue length during the previous load-balancing
-	 * operation and using the smaller one the current and saved lengths.
-	 * If a runqueue is long enough for a longer amount of time then
-	 * we recognize it and pull tasks from it.
-	 *
-	 * The 'current runqueue length' is a statistical maximum variable,
-	 * for that one we take the longer one - to avoid fluctuations in
-	 * the other direction. So for a load-balance to happen it needs
-	 * stable long runqueue on the target CPU and stable short runqueue
-	 * on the local runqueue.
-	 *
-	 * We make an exception if this CPU is about to become idle - in
-	 * that case we are less picky about moving a task across CPUs and
-	 * take what can be taken.
-	 */
 	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
-		nr_running = this_rq->nr_running;
+		*nr_running = this_rq->nr_running;
 	else
-		nr_running = this_rq->prev_nr_running[this_cpu];
+		*nr_running = this_rq->prev_nr_running[this_cpu];
 
-	busiest = NULL;
-	max_load = 1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for (n = 0; n < numnodes; n++) {
+		nd[n].busiest_cpu_load = -1;
+		nd[n].total_load = 0;
+	}
 
-		rq_src = cpu_rq(i);
-		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
-			load = rq_src->nr_running;
+	/* compute all node loads and save their max cpu loads */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_online(cpu))
+			continue;
+		node = &nd[__cpu_to_node(cpu)];
+		src_rq = cpu_rq(cpu);
+		if (idle || (src_rq->nr_running < this_rq->prev_nr_running[cpu]))
+			load = src_rq->nr_running;
 		else
-			load = this_rq->prev_nr_running[i];
-		this_rq->prev_nr_running[i] = rq_src->nr_running;
-
-		if ((load > max_load) && (rq_src != this_rq)) {
-			busiest = rq_src;
-			max_load = load;
+			load = this_rq->prev_nr_running[cpu];
+		this_rq->prev_nr_running[cpu] = src_rq->nr_running;
+		node->total_load += load;
+		if (load > node->busiest_cpu_load) {
+			node->busiest_cpu_load = load;
+			node->busiest_cpu = cpu;
 		}
 	}
 
-	if (likely(!busiest))
-		goto out;
+	busiest_cpu = nd[this_node].busiest_cpu;
+	if (busiest_cpu != this_cpu) {
+		if (!cpus_balanced(nd[this_node].busiest_cpu_load,*nr_running)) {
+			busiest_rq = cpu_rq(busiest_cpu);
+			this_rq->wait_node = -1;
+			goto out;
+		}
+	}
 
-	*imbalance = (max_load - nr_running) / 2;
+#ifdef CONFIG_NUMA
+	max_avg_load = -1;
+	for (n = 0; n < numnodes; n++) {
+		node = &nd[n];
+		system_load += node->total_load;
+		node->average_load = node->total_load*LOADSCALE/node_nr_cpus(n);
+		if (node->average_load > max_avg_load) {
+			max_avg_load = node->average_load;
+			busiest_node = n;
+		}
+	}
 
-	/* It needs an at least ~25% imbalance to trigger balancing. */
-	if (!idle && (*imbalance < (max_load + 3)/4)) {
-		busiest = NULL;
+	/* Exit if not enough imbalance on any remote node. */
+	if ((busiest_node == this_node) || (max_avg_load <= LOADSCALE) ||
+	    nodes_balanced(max_avg_load,nd[this_node].average_load)) {
+		this_rq->wait_node = -1;
 		goto out;
 	}
 
-	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
-	/*
-	 * Make sure nothing changed since we checked the
-	 * runqueue length.
-	 */
-	if (busiest->nr_running <= nr_running + 1) {
-		spin_unlock(&busiest->lock);
-		busiest = NULL;
+	busiest_cpu = nd[busiest_node].busiest_cpu;
+	avg_load = system_load*LOADSCALE/num_online_cpus();
+	/* Wait longer before stealing if own node's load is average. */
+	if (nodes_balanced(avg_load,nd[this_node].average_load))
+		steal_delay = NODE_DELAY_BUSY;
+	else
+		steal_delay = NODE_DELAY_IDLE;
+	/* if we have a new most loaded node: just mark it */
+	if (this_rq->wait_node != busiest_node) {
+		this_rq->wait_node = busiest_node;
+		this_rq->wait_time = jiffies;
+		goto out;
+	/* old most loaded node: check if waited enough */
+	} else if (jiffies - this_rq->wait_time < steal_delay)
+		goto out;
+
+	if ((!cpus_balanced(nd[busiest_node].busiest_cpu_load,*nr_running))) {
+		busiest_rq = cpu_rq(busiest_cpu);
+		this_rq->wait_node = -1;
 	}
-out:
-	return busiest;
+#endif
+ out:
+	return busiest_rq;
 }
 
 /*
@@ -758,16 +873,21 @@
  */
 static void load_balance(runqueue_t *this_rq, int idle)
 {
-	int imbalance, idx, this_cpu = smp_processor_id();
+	int imbalance, nr_running, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
 	prio_array_t *array;
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
+	busiest = find_busiest_queue(this_cpu, idle, &nr_running);
 	if (!busiest)
 		goto out;
 
+	imbalance = (busiest->nr_running - nr_running)/2;
+
+	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+	if (busiest->nr_running <= nr_running + 1)
+		goto out_unlock;
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -2110,7 +2230,8 @@
 		spin_unlock_irqrestore(&rq->lock, flags);
 
 		p = req->task;
-		cpu_dest = __ffs(p->cpus_allowed);
+		cpu_dest = __ffs(p->cpus_allowed & cpu_online_map);
+
 		rq_dest = cpu_rq(cpu_dest);
 repeat:
 		cpu_src = task_cpu(p);

next prev parent reply	other threads:[~2002-12-31 13:20 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-11-06 16:34 NUMA scheduler BK tree Erich Focht
2002-11-06 18:10 ` Michael Hohnbaum
2002-11-07 23:05   ` Erich Focht
2002-11-07 23:46 ` Michael Hohnbaum
2002-11-08 16:57   ` Erich Focht
2002-11-11 15:13 ` [PATCH 2.5.47] NUMA scheduler (1/2) Erich Focht
2002-11-11 15:14   ` [PATCH 2.5.47] NUMA scheduler (2/2) Erich Focht
2002-11-12  0:24   ` [PATCH 2.5.47] NUMA scheduler (1/2) Michael Hohnbaum
2002-11-18 19:40 ` NUMA scheduler BK tree Martin J. Bligh
2002-11-19 16:26   ` [PATCH 2.5.48] NUMA scheduler (1/2) Erich Focht
2002-11-19 16:27     ` [PATCH 2.5.48] NUMA scheduler (2/2) Erich Focht
2002-12-02 15:29     ` [PATCH 2.5.50] NUMA scheduler (1/2) Erich Focht
2002-12-02 15:30       ` [PATCH 2.5.50] NUMA scheduler (2/2) Erich Focht
2002-12-06 17:39       ` [PATCH 2.5.50] NUMA scheduler (1/2) Michael Hohnbaum
2002-12-18 16:21       ` [PATCH 2.5.52] " Erich Focht
2002-12-18 16:23         ` [PATCH 2.5.52] NUMA scheduler (2/2) Erich Focht
2002-12-20 14:49         ` [PATCH 2.5.52] NUMA scheduler: cputimes stats Erich Focht
2002-12-20 15:17         ` [PATCH 2.5.52] NUMA scheduler (1/2) Christoph Hellwig
2002-12-20 17:44           ` Erich Focht
2002-12-31 13:29         ` Erich Focht [this message]
2002-12-31 13:29           ` [PATCH 2.5.53] NUMA scheduler (2/3) Erich Focht
2002-12-31 13:30           ` [PATCH 2.5.53] NUMA scheduler (3/3) Erich Focht
2003-01-04  1:58           ` [PATCH 2.5.53] NUMA scheduler (1/3) Michael Hohnbaum
2003-01-05  5:35             ` Martin J. Bligh
2003-01-06  3:58               ` Michael Hohnbaum
2003-01-06  6:07                 ` Martin J. Bligh
2003-01-07  2:23                   ` Michael Hohnbaum
2003-01-07 11:27                     ` Erich Focht
2003-01-07 23:35                       ` Michael Hohnbaum

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200212311429.04382.efocht@ess.nec.de \
    --to=efocht@ess.nec.de \
    --cc=hohnbaum@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mbligh@aracnet.com \
    --cc=mingo@elte.hu \
    --cc=rml@tech9.net \
    --cc=shemminger@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.