public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2.5.45] NUMA Scheduler  (1/2)
@ 2002-10-31 23:31 Michael Hohnbaum
  2002-10-31 23:33 ` [PATCH 2.5.45] NUMA Scheduler (2/2) Michael Hohnbaum
  2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
  0 siblings, 2 replies; 6+ messages in thread
From: Michael Hohnbaum @ 2002-10-31 23:31 UTC (permalink / raw)
  To: Linus Torvalds, linux-kernel; +Cc: mingo, Erich Focht

[-- Attachment #1: Type: text/plain, Size: 878 bytes --]

Linus,

Erich Focht has written scheduler extensions in support of
NUMA systems.  These extensions are being used at customer
sites.  I have branched off and done some similar NUMA scheduler
extensions, though on a much smaller scale.  We have combined
efforts and produced two patches which provide minimal NUMA
scheduler capabilities.

The first patch provides node awareness to the load balancer.
This is derived directly from Erich's Node aware NUMA scheduler.
The second patch adds load balancing at exec.  This is derived
from work that I have done.  Together, these patches provide
performance gains for kernel compilation of between 5 and 10%.
On memory-bandwidth-intensive microbenchmarks we have seen gains
of 50-100%.  

Please consider for inclusion in 2.5.

-- 

Michael Hohnbaum                      503-578-5486
hohnbaum@us.ibm.com                   T/L 775-5486

[-- Attachment #2: 01-numa_sched_core-2.5.44-21.patch --]
[-- Type: text/x-patch, Size: 14066 bytes --]

diff -urNp a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Sat Oct 19 06:01:53 2002
+++ b/arch/i386/kernel/smpboot.c	Thu Oct 31 18:40:29 2002
@@ -1195,6 +1195,9 @@ int __devinit __cpu_up(unsigned int cpu)
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	zap_low_mappings();
+#ifdef CONFIG_NUMA
+	build_node_data();
+#endif
 }
 
 void __init smp_intr_init()
diff -urNp a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
--- a/arch/ia64/kernel/smpboot.c	Thu Oct 31 12:49:25 2002
+++ b/arch/ia64/kernel/smpboot.c	Thu Oct 31 18:40:29 2002
@@ -397,7 +397,7 @@ unsigned long cache_decay_ticks;	/* # of
 static void
 smp_tune_scheduling (void)
 {
-	cache_decay_ticks = 10;	/* XXX base this on PAL info and cache-bandwidth estimate */
+	cache_decay_ticks = 8;	/* XXX base this on PAL info and cache-bandwidth estimate */
 
 	printk("task migration cache decay timeout: %ld msecs.\n",
 	       (cache_decay_ticks + 1) * 1000 / HZ);
@@ -522,6 +522,9 @@ smp_cpus_done (unsigned int dummy)
 
 	printk(KERN_INFO"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 	       num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100);
+#ifdef CONFIG_NUMA
+	build_node_data();
+#endif
 }
 
 int __devinit
diff -urNp a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c
--- a/arch/ppc64/kernel/smp.c	Sat Oct 19 06:02:27 2002
+++ b/arch/ppc64/kernel/smp.c	Thu Oct 31 18:40:29 2002
@@ -679,4 +679,7 @@ void __init smp_cpus_done(unsigned int m
 
 	/* XXX fix this, xics currently relies on it - Anton */
 	smp_threads_ready = 1;
+#ifdef CONFIG_NUMA
+	build_node_data();
+#endif
 }
diff -urNp a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Thu Oct 31 11:34:14 2002
+++ b/include/linux/sched.h	Thu Oct 31 18:40:29 2002
@@ -22,6 +22,7 @@ extern unsigned long event;
 #include <asm/mmu.h>
 
 #include <linux/smp.h>
+#include <asm/topology.h>
 #include <linux/sem.h>
 #include <linux/signal.h>
 #include <linux/securebits.h>
@@ -157,7 +158,6 @@ extern void update_one_process(struct ta
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
 
-
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
 asmlinkage void schedule(void);
@@ -443,6 +443,9 @@ extern void set_cpus_allowed(task_t *p, 
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+#ifdef CONFIG_NUMA
+extern void build_pools(void);
+#endif
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
diff -urNp a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	Sat Oct 19 06:02:28 2002
+++ b/kernel/sched.c	Thu Oct 31 19:23:57 2002
@@ -157,6 +157,9 @@ struct runqueue {
 	task_t *migration_thread;
 	struct list_head migration_queue;
 
+	unsigned long wait_time;
+	int wait_node;
+
 } ____cacheline_aligned;
 
 static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
@@ -176,6 +179,56 @@ static struct runqueue runqueues[NR_CPUS
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
+#define cpu_to_node(cpu) __cpu_to_node(cpu)
+#define LOADSCALE 128
+
+#ifdef CONFIG_NUMA
+/* Number of CPUs per node: sane values until all CPUs are up */
+int _node_nr_cpus[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = NR_CPUS };
+int node_ptr[MAX_NUMNODES+1]; /* first cpu of node (logical cpus are sorted!)*/
+#define node_ncpus(node)  _node_nr_cpus[node]
+
+#define NODE_DELAY_IDLE  (1*HZ/1000)
+#define NODE_DELAY_BUSY  (20*HZ/1000)
+
+/* the following macro implies that logical CPUs are sorted by node number */
+#define loop_over_node(cpu,node) \
+	for(cpu=node_ptr[node]; cpu<node_ptr[node+1]; cpu++)
+
+/*
+ * Build node_ptr and node_ncpus data after all CPUs have come up. This
+ * expects that the logical CPUs are sorted by their node numbers! Check
+ * out the NUMA API for details on why this should be that way.     [EF]
+ */
+void build_node_data(void)
+{
+	int n, cpu, ptr;
+	unsigned long mask;
+
+	ptr=0;
+	for (n=0; n<numnodes; n++) {
+		mask = __node_to_cpu_mask(n) & cpu_online_map;
+		node_ptr[n] = ptr;
+		for (cpu=0; cpu<NR_CPUS; cpu++)
+			if (mask  & (1UL << cpu))
+				ptr++;
+		node_ncpus(n) = ptr - node_ptr[n];;
+	}
+	printk("CPU nodes : %d\n",numnodes);
+	for (n=0; n<numnodes; n++)
+		printk("node %d : %d .. %d\n",n,node_ptr[n],node_ptr[n+1]-1);
+	if (cache_decay_ticks==1)
+		printk("WARNING: cache_decay_ticks=1, probably unset by platform. Running with poor CPU affinity!\n");
+}
+
+#else
+#define node_ncpus(node)  num_online_cpus()
+#define NODE_DELAY_IDLE 0
+#define NODE_DELAY_BUSY 0
+#define loop_over_node(cpu,n) for(cpu=0; cpu<NR_CPUS; cpu++)
+#endif
+
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts.  Note the ordering: we can safely lookup the task_rq without
@@ -635,81 +688,134 @@ static inline unsigned int double_lock_b
 }
 
 /*
- * find_busiest_queue - find the busiest runqueue.
- */
-static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance)
-{
-	int nr_running, load, max_load, i;
-	runqueue_t *busiest, *rq_src;
+ * Find a runqueue from which to steal a task. We try to do this as locally as
+ * possible because we don't want to let tasks get far from their node.
+ * 
+ * 1. First try to find a runqueue within the own node with
+ * imbalance larger than 25% (relative to the current runqueue).
+ * 2. If the local node is well balanced, locate the most loaded node and its
+ * most loaded CPU.
+ *
+ * This routine implements node balancing by delaying steals from remote
+ * nodes more if the own node is (within margins) averagely loaded. The
+ * most loaded node is remembered as well as the time (jiffies). In the
+ * following calls to the load_balancer the time is compared with
+ * NODE_DELAY_BUSY (if load is around the average) or NODE_DELAY_IDLE (if own
+ * node is unloaded) if the most loaded node didn't change. This gives less 
+ * loaded nodes the chance to approach the average load but doesn't exclude
+ * busy nodes from stealing (just in case the cpus_allowed mask isn't good
+ * for the idle nodes).
+ * This concept can be extended easily to more than two levels (multi-level
+ * scheduler), e.g.: CPU -> node -> supernode... by implementing node-distance
+ * dependent steal delays.
+ *
+ *                                                         <efocht@ess.nec.de>
+ */
+
+struct node_queue_data {
+	int total_load;
+	int average_load;
+	int busiest_cpu;
+	int busiest_cpu_load;
+};
+
+/*
+ * Check if CPUs are balanced. The check is more involved than the O(1) original
+ * because that one is simply wrong in certain cases (integer arithmetic !!!) EF
+ */
+#define CPUS_BALANCED(m,t) (((m)<=1) || (((m)-(t))/2 < (((m)+(t))/2 + 3)/4))
+/*
+ * Check if nodes are imbalanced. "this" node is balanced (compared to the "comp"
+ * node) when its load is not smaller than "comp"'s load - LOADSCALE/2.
+ */
+#define NODES_BALANCED(comp,this) (((comp)-(this)) < LOADSCALE/2)
+
+static inline runqueue_t *find_busiest_queue(int this_cpu, int idle, int *nr_running)
+{
+	runqueue_t *busiest_rq = NULL, *this_rq = cpu_rq(this_cpu), *src_rq;
+	int busiest_cpu, busiest_node=0, cpu, load, max_avg_load, avg_load;
+	int n, steal_delay, system_load = 0, this_node=cpu_to_node(this_cpu); 
+	struct node_queue_data nd[MAX_NUMNODES], *node;
 
-	/*
-	 * We search all runqueues to find the most busy one.
-	 * We do this lockless to reduce cache-bouncing overhead,
-	 * we re-check the 'best' source CPU later on again, with
-	 * the lock held.
-	 *
-	 * We fend off statistical fluctuations in runqueue lengths by
-	 * saving the runqueue length during the previous load-balancing
-	 * operation and using the smaller one the current and saved lengths.
-	 * If a runqueue is long enough for a longer amount of time then
-	 * we recognize it and pull tasks from it.
-	 *
-	 * The 'current runqueue length' is a statistical maximum variable,
-	 * for that one we take the longer one - to avoid fluctuations in
-	 * the other direction. So for a load-balance to happen it needs
-	 * stable long runqueue on the target CPU and stable short runqueue
-	 * on the local runqueue.
-	 *
-	 * We make an exception if this CPU is about to become idle - in
-	 * that case we are less picky about moving a task across CPUs and
-	 * take what can be taken.
-	 */
 	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
-		nr_running = this_rq->nr_running;
+		*nr_running = this_rq->nr_running;
 	else
-		nr_running = this_rq->prev_nr_running[this_cpu];
+		*nr_running = this_rq->prev_nr_running[this_cpu];
 
-	busiest = NULL;
-	max_load = 1;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_online(i))
-			continue;
+	for (n = 0; n < numnodes; n++) {
+		nd[n].busiest_cpu_load = -1;
+		nd[n].total_load = 0;
+	}
 
-		rq_src = cpu_rq(i);
-		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
-			load = rq_src->nr_running;
+	/* compute all node loads and save their max cpu loads */
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_online(cpu)) continue;
+		node = &nd[cpu_to_node(cpu)];
+		src_rq = cpu_rq(cpu);
+		if (idle || (src_rq->nr_running < this_rq->prev_nr_running[cpu]))
+			load = src_rq->nr_running;
 		else
-			load = this_rq->prev_nr_running[i];
-		this_rq->prev_nr_running[i] = rq_src->nr_running;
-
-		if ((load > max_load) && (rq_src != this_rq)) {
-			busiest = rq_src;
-			max_load = load;
+			load = this_rq->prev_nr_running[cpu];
+		this_rq->prev_nr_running[cpu] = src_rq->nr_running;
+		node->total_load += load;
+		if (load > node->busiest_cpu_load) {
+			node->busiest_cpu_load = load;
+			node->busiest_cpu = cpu;
 		}
 	}
 
-	if (likely(!busiest))
-		goto out;
+	busiest_cpu = nd[this_node].busiest_cpu;
+	if (busiest_cpu != this_cpu) {
+		if (!CPUS_BALANCED(nd[this_node].busiest_cpu_load,*nr_running)) {
+			busiest_rq = cpu_rq(busiest_cpu);
+			this_rq->wait_node = -1;
+			goto out;
+		}
+	}
 
-	*imbalance = (max_load - nr_running) / 2;
+#ifdef CONFIG_NUMA
+	max_avg_load = -1;
+	for (n = 0; n < numnodes; n++) {
+		node = &nd[n];
+		system_load += node->total_load;
+		node->average_load = node->total_load*LOADSCALE/node_ncpus(n);
+		if (node->average_load > max_avg_load) {
+			max_avg_load = node->average_load;
+			busiest_node = n;
+		}
+	}
 
-	/* It needs an at least ~25% imbalance to trigger balancing. */
-	if (!idle && (*imbalance < (max_load + 3)/4)) {
-		busiest = NULL;
+	/* Exit if not enough imbalance on any remote node. */
+	if ((busiest_node == this_node) || (max_avg_load <= LOADSCALE) ||
+	    NODES_BALANCED(max_avg_load,nd[this_node].average_load)) {
+		this_rq->wait_node = -1;
 		goto out;
 	}
 
-	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
-	/*
-	 * Make sure nothing changed since we checked the
-	 * runqueue length.
-	 */
-	if (busiest->nr_running <= nr_running + 1) {
-		spin_unlock(&busiest->lock);
-		busiest = NULL;
+	busiest_cpu = nd[busiest_node].busiest_cpu;
+	avg_load = system_load*LOADSCALE/num_online_cpus();
+	/* Wait longer before stealing if own node's load is average. */
+	if (NODES_BALANCED(avg_load,nd[this_node].average_load))
+		steal_delay = NODE_DELAY_BUSY;
+	else
+		steal_delay = NODE_DELAY_IDLE;
+	/* if we have a new most loaded node: just mark it */
+	if (this_rq->wait_node != busiest_node) {
+		this_rq->wait_node = busiest_node;
+		this_rq->wait_time = jiffies;
+		goto out;
+	} else
+	/* old most loaded node: check if waited enough */
+		if (jiffies - this_rq->wait_time < steal_delay)
+			goto out;
+
+	if ((!CPUS_BALANCED(nd[busiest_node].busiest_cpu_load,*nr_running))) {
+		busiest_rq = cpu_rq(busiest_cpu);
+		this_rq->wait_node = -1;
 	}
-out:
-	return busiest;
+#endif
+ out:
+	return busiest_rq;
 }
 
 /*
@@ -741,16 +847,21 @@ static inline void pull_task(runqueue_t 
  */
 static void load_balance(runqueue_t *this_rq, int idle)
 {
-	int imbalance, idx, this_cpu = smp_processor_id();
+	int imbalance, nr_running, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
 	prio_array_t *array;
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance);
+	busiest = find_busiest_queue(this_cpu, idle, &nr_running);
 	if (!busiest)
 		goto out;
 
+	imbalance = (busiest->nr_running - nr_running)/2;
+
+	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+	if (busiest->nr_running <= nr_running + 1)
+		goto out_unlock;
 	/*
 	 * We first consider expired tasks. Those will likely not be
 	 * executed in the near future, and they are most likely to
@@ -822,10 +933,10 @@ out:
  * frequency and balancing agressivity depends on whether the CPU is
  * idle or not.
  *
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
  */
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
+#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 
 static inline void idle_tick(runqueue_t *rq)
@@ -2036,7 +2147,8 @@ static int migration_thread(void * data)
 		spin_unlock_irqrestore(&rq->lock, flags);
 
 		p = req->task;
-		cpu_dest = __ffs(p->cpus_allowed);
+		cpu_dest = __ffs(p->cpus_allowed & cpu_online_map);
+
 		rq_dest = cpu_rq(cpu_dest);
 repeat:
 		cpu_src = task_cpu(p);
@@ -2136,6 +2248,8 @@ void __init sched_init(void)
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
 	}
+	if (cache_decay_ticks)
+		cache_decay_ticks=1;
 	/*
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2.5.45] NUMA Scheduler  (2/2)
  2002-10-31 23:31 [PATCH 2.5.45] NUMA Scheduler (1/2) Michael Hohnbaum
@ 2002-10-31 23:33 ` Michael Hohnbaum
  2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
  1 sibling, 0 replies; 6+ messages in thread
From: Michael Hohnbaum @ 2002-10-31 23:33 UTC (permalink / raw)
  To: Michael Hohnbaum; +Cc: Linus Torvalds, linux-kernel, mingo, Erich Focht

[-- Attachment #1: Type: text/plain, Size: 996 bytes --]

On Thu, 2002-10-31 at 15:31, Michael Hohnbaum wrote:
> Linus,
> 
> Erich Focht has written scheduler extensions in support of
> NUMA systems.  These extensions are being used at customer
> sites.  I have branched off and done some similar NUMA scheduler
> extensions, though on a much smaller scale.  We have combined
> efforts and produced two patches which provide minimal NUMA
> scheduler capabilities.
> 
> The first patch provides node awareness to the load balancer.
> This is derived directly from Erich's Node aware NUMA scheduler.
> The second patch adds load balancing at exec.  This is derived
> from work that I have done.  Together, these patches provide
> performance gains for kernel compilation of between 5 and 10%.
> On memory-bandwidth-intensive microbenchmarks we have seen gains
> of 50-100%.  
> 
> Please consider for inclusion in 2.5.
> 

Here is the second patch.
-- 

Michael Hohnbaum                      503-578-5486
hohnbaum@us.ibm.com                   T/L 775-5486

[-- Attachment #2: 03-numa_sched_ilbMH-2.5.43-20.patch --]
[-- Type: text/x-patch, Size: 5137 bytes --]

diff -urNp c/fs/exec.c d/fs/exec.c
--- c/fs/exec.c	Wed Oct 16 05:27:53 2002
+++ d/fs/exec.c	Wed Oct 30 14:25:59 2002
@@ -1001,6 +1001,8 @@ int do_execve(char * filename, char ** a
 	int retval;
 	int i;
 
+	sched_balance_exec();
+
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
diff -urNp c/include/linux/sched.h d/include/linux/sched.h
--- c/include/linux/sched.h	Wed Oct 30 14:02:01 2002
+++ d/include/linux/sched.h	Wed Oct 30 14:25:59 2002
@@ -167,6 +167,19 @@ extern void update_one_process(struct ta
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
+#ifdef CONFIG_NUMA
+extern void sched_balance_exec(void);
+extern void node_nr_running_init(void);
+#define nr_running_inc(rq) atomic_inc(rq->node_ptr); \
+	rq->nr_running++
+#define nr_running_dec(rq) atomic_dec(rq->node_ptr); \
+	rq->nr_running--
+#else
+#define sched_balance_exec() {}
+#define node_nr_running_init() {}
+#define nr_running_inc(rq) rq->nr_running++
+#define nr_running_dec(rq) rq->nr_running--
+#endif
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
diff -urNp c/init/main.c d/init/main.c
--- c/init/main.c	Wed Oct 16 05:27:19 2002
+++ d/init/main.c	Wed Oct 30 14:25:59 2002
@@ -495,6 +495,7 @@ static void do_pre_smp_initcalls(void)
 
 	migration_init();
 #endif
+	node_nr_running_init();
 	spawn_ksoftirqd();
 }
 
diff -urNp c/kernel/sched.c d/kernel/sched.c
--- c/kernel/sched.c	Wed Oct 30 14:15:41 2002
+++ d/kernel/sched.c	Wed Oct 30 14:27:22 2002
@@ -153,6 +153,7 @@ struct runqueue {
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
+	atomic_t * node_ptr;
 
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -346,7 +347,7 @@ static inline void activate_task(task_t 
 		p->prio = effective_prio(p);
 	}
 	enqueue_task(p, array);
-	rq->nr_running++;
+	nr_running_inc(rq);
 }
 
 /*
@@ -354,7 +355,7 @@ static inline void activate_task(task_t 
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	nr_running_dec(rq);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -897,9 +898,9 @@ skip_queue:
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	nr_running_inc(this_rq);
 	enqueue_task(p, this_rq->active);
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2228,6 +2229,83 @@ __init int migration_init(void)
 
 #endif
 
+#if CONFIG_NUMA
+static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+__init void node_nr_running_init(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		cpu_rq(i)->node_ptr = &node_nr_running[__cpu_to_node(i)];
+	}
+	return;
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
+ * the cpu_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	unsigned long old_mask;
+
+	old_mask = p->cpus_allowed;
+	if (!(old_mask & (1UL << dest_cpu)))
+		return;
+	/* force the process onto the specified CPU */
+	set_cpus_allowed(p, 1UL << dest_cpu);
+
+	/* restore the cpus allowed mask */
+	set_cpus_allowed(p, old_mask);
+}
+
+/*
+ * Find the least loaded CPU.  Slightly favor the current CPU by
+ * setting its runqueue length as the minimum to start.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int i, minload, load, best_cpu, node = 0;
+
+	best_cpu = task_cpu(p);
+	if (cpu_rq(best_cpu)->nr_running <= 2)
+		return best_cpu;
+
+	minload = 10000000;
+	for (i = 0; i < numnodes; i++) {
+		load = atomic_read(&node_nr_running[i]);
+		if (load < minload) {
+			minload = load;
+			node = i;
+		}
+	}
+	minload = 10000000;
+	loop_over_node(i,node) {
+		if (!cpu_online(i))
+			continue;
+		if (cpu_rq(i)->nr_running < minload) {
+			best_cpu = i;
+			minload = cpu_rq(i)->nr_running;
+		}
+	}
+	return best_cpu;
+}
+
+void sched_balance_exec(void)
+{
+	int new_cpu;
+
+	if (numnodes > 1) {
+		new_cpu = sched_best_cpu(current);
+		if (new_cpu != smp_processor_id())
+			sched_migrate_task(current, new_cpu);
+	}
+}
+#endif /* CONFIG_NUMA */
+
 #if CONFIG_SMP || CONFIG_PREEMPT
 /*
  * The 'big kernel lock'
@@ -2255,6 +2334,10 @@ void __init sched_init(void)
 		rq->expired = rq->arrays + 1;
 		spin_lock_init(&rq->lock);
 		INIT_LIST_HEAD(&rq->migration_queue);
+#if CONFIG_NUMA
+		rq->node_ptr = &node_nr_running[0];
+#endif /* CONFIG_NUMA */
+
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2.5.45] NUMA Scheduler  (1/2)
  2002-10-31 23:31 [PATCH 2.5.45] NUMA Scheduler (1/2) Michael Hohnbaum
  2002-10-31 23:33 ` [PATCH 2.5.45] NUMA Scheduler (2/2) Michael Hohnbaum
@ 2002-10-31 23:52 ` Martin J. Bligh
  2002-11-01  0:10   ` Robert Love
                     ` (2 more replies)
  1 sibling, 3 replies; 6+ messages in thread
From: Martin J. Bligh @ 2002-10-31 23:52 UTC (permalink / raw)
  To: Michael Hohnbaum, Linus Torvalds, linux-kernel; +Cc: mingo, Erich Focht

> Erich Focht has written scheduler extensions in support of
> NUMA systems.  These extensions are being used at customer
> sites.  I have branched off and done some similar NUMA scheduler
> extensions, though on a much smaller scale.  We have combined
> efforts and produced two patches which provide minimal NUMA
> scheduler capabilities.

Just wanted to add that everyone that's been involved in this is
now in harmonious agreement about this combined solution. If you're
curious as to where the benefits come from, the differences in 
kernel profiles are included below from a 16-way NUMA-Q doing a
kernel compile.

Positive numbers got worse with the patch, negative got better:
(differences below 50 ticks cut off to increase signal:noise ratio)

132 d_lookup
80 strnlen_user
72 atomic_dec_and_lock
...
-50 file_move
-58 pte_alloc_one
-83 __set_page_dirty_buffers
-84 do_wp_page
-109 free_hot_cold_page
-119 clear_page_tables
-128 __copy_to_user
-175 zap_pte_range
-194 buffered_rmqueue
-237 page_remove_rmap
-897 __copy_from_user
-907 do_anonymous_page

As would be expected most of the gain is in the memory management
functions (do_anonymous page is doing pre-zeroing of pages, and
is always the biggest item on these profiles).

M.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2.5.45] NUMA Scheduler  (1/2)
  2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
@ 2002-11-01  0:10   ` Robert Love
  2002-11-01  2:35   ` Martin J. Bligh
  2002-11-01 17:06   ` Erich Focht
  2 siblings, 0 replies; 6+ messages in thread
From: Robert Love @ 2002-11-01  0:10 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Michael Hohnbaum, Linus Torvalds, linux-kernel, mingo,
	Erich Focht

On Thu, 2002-10-31 at 18:52, Martin J. Bligh wrote:

> Just wanted to add that everyone that's been involved in this is
> now in harmonious agreement about this combined solution. If you're
> curious as to where the benefits come from, the differences in 
> kernel profiles are included below from a 16-way NUMA-Q doing a
> kernel compile.

Linus, although these patches are fairly straightforward and
non-impacting in the !CONFIG_NUMA case, would you prefer it if a
non-NUMA person who knew the scheduler (say, me) went over these patches
and merged them with you?

Ingo, do you have an opinion either way?  I think basic NUMA support,
especially in the load balancer, should make it in before 2.6.

	Robert Love


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2.5.45] NUMA Scheduler  (1/2)
  2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
  2002-11-01  0:10   ` Robert Love
@ 2002-11-01  2:35   ` Martin J. Bligh
  2002-11-01 17:06   ` Erich Focht
  2 siblings, 0 replies; 6+ messages in thread
From: Martin J. Bligh @ 2002-11-01  2:35 UTC (permalink / raw)
  To: Michael Hohnbaum, Linus Torvalds, linux-kernel; +Cc: mingo, Erich Focht

[-- Attachment #1: Type: text/plain, Size: 245 bytes --]

Hum. A last-minute change broke UP compilation.
Attached ... should come out as text/plain so you can read 
it, but if it all goes wrong, it just removes:

       if (cache_decay_ticks)
               cache_decay_ticks=1;

from sched_init.

M.

[-- Attachment #2: numaschedfix --]
[-- Type: text/plain, Size: 357 bytes --]

--- 2.5.45-numasched/kernel/sched.c.old	2002-10-31 17:48:41.000000000 -0800
+++ 2.5.45-numasched/kernel/sched.c	2002-10-31 17:51:43.000000000 -0800
@@ -2331,8 +2331,7 @@
 			__set_bit(MAX_PRIO, array->bitmap);
 		}
 	}
-	if (cache_decay_ticks)
-		cache_decay_ticks=1;
+
 	/*
 	 * We have to do a little magic to get the first
 	 * thread right in SMP mode.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2.5.45] NUMA Scheduler  (1/2)
  2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
  2002-11-01  0:10   ` Robert Love
  2002-11-01  2:35   ` Martin J. Bligh
@ 2002-11-01 17:06   ` Erich Focht
  2 siblings, 0 replies; 6+ messages in thread
From: Erich Focht @ 2002-11-01 17:06 UTC (permalink / raw)
  To: Martin J. Bligh, Michael Hohnbaum, Linus Torvalds, linux-kernel; +Cc: mingo

On Friday 01 November 2002 00:52, Martin J. Bligh wrote:
> > Erich Focht has written scheduler extensions in support of
> > NUMA systems.  These extensions are being used at customer
> > sites.  I have branched off and done some similar NUMA scheduler
> > extensions, though on a much smaller scale.  We have combined
> > efforts and produced two patches which provide minimal NUMA
> > scheduler capabilities.
>
> Just wanted to add that everyone that's been involved in this is
> now in harmonious agreement about this combined solution.

Yes, I'd like to confirm this, too. Just couldn't be online last night
(european time) so I'm glad that Michael sent it all out. Happily
we have a holiday over here today, after Halloween ;-)

Regards,
Erich


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2002-11-01 17:00 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-10-31 23:31 [PATCH 2.5.45] NUMA Scheduler (1/2) Michael Hohnbaum
2002-10-31 23:33 ` [PATCH 2.5.45] NUMA Scheduler (2/2) Michael Hohnbaum
2002-10-31 23:52 ` [PATCH 2.5.45] NUMA Scheduler (1/2) Martin J. Bligh
2002-11-01  0:10   ` Robert Love
2002-11-01  2:35   ` Martin J. Bligh
2002-11-01 17:06   ` Erich Focht

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox