All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Martin J. Bligh" <mbligh@aracnet.com>
To: Andrew Theurer <habanero@us.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>,
	Robert Love <rml@tech9.net>,
	Michael Hohnbaum <hohnbaum@us.ibm.com>,
	Andrew Theurer <habanero@us.ibm.com>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	lse-tech <lse-tech@lists.sourceforge.net>,
	Erich Focht <efocht@ess.nec.de>, Ingo Molnar <mingo@elte.hu>
Subject: NUMA sched -> pooling scheduler (inc HT)
Date: Sat, 18 Jan 2003 12:54:21 -0800	[thread overview]
Message-ID: <550960000.1042923260@titus> (raw)
In-Reply-To: <270920000.1042822723@titus>

[-- Attachment #1: Type: text/plain, Size: 984 bytes --]

Andrew, hopefully this'll give you a cleaner integration point to do 
the HT scheduler stuff ... I basically did a rename of "node" to "pool" 
on sched.c (OK, it was a little more complex than that), and provided
some hooks for you to attach to. There's a really hacky version of
the HT stuff in there that I doubt works at all. (sched.h will need
something other than CONFIG_SCHED_NUMA, for starters). 

It's not really finished, but I have to go out ... I thought you or 
someone else might like to have a play with it in the meantime. 
Goes on top of the second half of Ingo's stuff from yesterday 
(also attached).

I think this should result in a much cleaner integration between the HT
aware stuff and the NUMA stuff. Pools is a concept Erich had in his
scheduler a while back, but it got set aside in the paring down for
integration. We should be able to add multiple levels to this fairly
easily, at some point (eg HT + NUMA), but let's get the basics working
first ;-)

M.

[-- Attachment #2: 01-ingo --]
[-- Type: application/octet-stream, Size: 6829 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 00-virgin/kernel/sched.c 01-ingo/kernel/sched.c
--- 00-virgin/kernel/sched.c	Fri Jan 17 09:18:32 2003
+++ 01-ingo/kernel/sched.c	Sat Jan 18 10:58:57 2003
@@ -153,10 +153,9 @@ struct runqueue {
 			nr_uninterruptible;
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
-	int prev_nr_running[NR_CPUS];
+	int prev_cpu_load[NR_CPUS];
 #ifdef CONFIG_NUMA
 	atomic_t *node_nr_running;
-	unsigned int nr_balanced;
 	int prev_node_load[MAX_NUMNODES];
 #endif
 	task_t *migration_thread;
@@ -765,29 +764,6 @@ static int find_busiest_node(int this_no
 	return node;
 }
 
-static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
-{
-	int this_node = __cpu_to_node(this_cpu);
-	/*
-	 * Avoid rebalancing between nodes too often.
-	 * We rebalance globally once every NODE_BALANCE_RATE load balances.
-	 */
-	if (++(this_rq->nr_balanced) == NODE_BALANCE_RATE) {
-		int node = find_busiest_node(this_node);
-		this_rq->nr_balanced = 0;
-		if (node >= 0)
-			return (__node_to_cpu_mask(node) | (1UL << this_cpu));
-	}
-	return __node_to_cpu_mask(this_node);
-}
-
-#else /* !CONFIG_NUMA */
-
-static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
-{
-	return cpu_online_map;
-}
-
 #endif /* CONFIG_NUMA */
 
 #if CONFIG_SMP
@@ -807,10 +783,10 @@ static inline unsigned int double_lock_b
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
 			/* Need to recalculate nr_running */
-			if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+			if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 				nr_running = this_rq->nr_running;
 			else
-				nr_running = this_rq->prev_nr_running[this_cpu];
+				nr_running = this_rq->prev_cpu_load[this_cpu];
 		} else
 			spin_lock(&busiest->lock);
 	}
@@ -847,10 +823,10 @@ static inline runqueue_t *find_busiest_q
 	 * that case we are less picky about moving a task across CPUs and
 	 * take what can be taken.
 	 */
-	if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu]))
+	if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
 		nr_running = this_rq->nr_running;
 	else
-		nr_running = this_rq->prev_nr_running[this_cpu];
+		nr_running = this_rq->prev_cpu_load[this_cpu];
 
 	busiest = NULL;
 	max_load = 1;
@@ -859,11 +835,11 @@ static inline runqueue_t *find_busiest_q
 			continue;
 
 		rq_src = cpu_rq(i);
-		if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i]))
+		if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
 			load = rq_src->nr_running;
 		else
-			load = this_rq->prev_nr_running[i];
-		this_rq->prev_nr_running[i] = rq_src->nr_running;
+			load = this_rq->prev_cpu_load[i];
+		this_rq->prev_cpu_load[i] = rq_src->nr_running;
 
 		if ((load > max_load) && (rq_src != this_rq)) {
 			busiest = rq_src;
@@ -922,7 +898,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle)
+static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -930,8 +906,7 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance,
-					cpus_to_balance(this_cpu, this_rq));
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
 	if (!busiest)
 		goto out;
 
@@ -1006,21 +981,75 @@ out:
  * frequency and balancing agressivity depends on whether the CPU is
  * idle or not.
  *
- * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on
+ * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
+ *
+ * On NUMA, do a node-rebalance every 400 msecs.
  */
-#define BUSY_REBALANCE_TICK (HZ/4 ?: 1)
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
+#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 2)
+#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
 
-static inline void idle_tick(runqueue_t *rq)
+#if CONFIG_NUMA
+static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
 {
-	if (jiffies % IDLE_REBALANCE_TICK)
-		return;
-	spin_lock(&rq->lock);
-	load_balance(rq, 1);
-	spin_unlock(&rq->lock);
+	int node = find_busiest_node(__cpu_to_node(this_cpu));
+	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
+
+	if (node >= 0) {
+		cpumask = __node_to_cpu_mask(node) | this_cpumask;
+		spin_lock(&this_rq->lock);
+		load_balance(this_rq, idle, cpumask);
+		spin_unlock(&this_rq->lock);
+	}
 }
+#endif
 
+static void rebalance_tick(runqueue_t *this_rq, int idle)
+{
+#if CONFIG_NUMA
+	int this_cpu = smp_processor_id();
+#endif
+	unsigned long j = jiffies;
+
+	/*
+	 * First do inter-node rebalancing, then intra-node rebalancing,
+	 * if both events happen in the same tick. The inter-node
+	 * rebalancing does not necessarily have to create a perfect
+	 * balance within the node, since we load-balance the most loaded
+	 * node with the current CPU. (ie. other CPUs in the local node
+	 * are not balanced.)
+	 */
+	if (idle) {
+#if CONFIG_NUMA
+		if (!(j % IDLE_NODE_REBALANCE_TICK))
+			balance_node(this_rq, idle, this_cpu);
+#endif
+		if (!(j % IDLE_REBALANCE_TICK)) {
+			spin_lock(&this_rq->lock);
+			load_balance(this_rq, 0, __cpu_to_node_mask(this_cpu));
+			spin_unlock(&this_rq->lock);
+		}
+		return;
+	}
+#if CONFIG_NUMA
+	if (!(j % BUSY_NODE_REBALANCE_TICK))
+		balance_node(this_rq, idle, this_cpu);
+#endif
+	if (!(j % BUSY_REBALANCE_TICK)) {
+		spin_lock(&this_rq->lock);
+		load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu));
+		spin_unlock(&this_rq->lock);
+	}
+}
+#else
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(runqueue_t *this_rq, int idle)
+{
+}
 #endif
 
 DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
@@ -1063,9 +1092,7 @@ void scheduler_tick(int user_ticks, int 
 			kstat_cpu(cpu).cpustat.iowait += sys_ticks;
 		else
 			kstat_cpu(cpu).cpustat.idle += sys_ticks;
-#if CONFIG_SMP
-		idle_tick(rq);
-#endif
+		rebalance_tick(rq, 1);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
@@ -1121,11 +1148,8 @@ void scheduler_tick(int user_ticks, int 
 			enqueue_task(p, rq->active);
 	}
 out:
-#if CONFIG_SMP
-	if (!(jiffies % BUSY_REBALANCE_TICK))
-		load_balance(rq, 0);
-#endif
 	spin_unlock(&rq->lock);
+	rebalance_tick(rq, 0);
 }
 
 void scheduling_functions_start_here(void) { }
@@ -1184,7 +1208,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #if CONFIG_SMP
-		load_balance(rq, 1);
+		load_balance(rq, 1, __cpu_to_node_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

[-- Attachment #3: 02-pools --]
[-- Type: application/octet-stream, Size: 12912 bytes --]

diff -urpN -X /home/fletch/.diff.exclude 01-ingo/arch/i386/Kconfig 02-pools/arch/i386/Kconfig
--- 01-ingo/arch/i386/Kconfig	Fri Jan 17 09:18:19 2003
+++ 02-pools/arch/i386/Kconfig	Sat Jan 18 11:59:54 2003
@@ -476,6 +476,11 @@ config NUMA
 	bool "Numa Memory Allocation Support"
 	depends on X86_NUMAQ
 
+config SCHED_NUMA
+	bool "NUMA aware scheduler"
+	depends on NUMA
+	default y
+
 config DISCONTIGMEM
 	bool
 	depends on NUMA
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/arch/ia64/Kconfig 02-pools/arch/ia64/Kconfig
--- 01-ingo/arch/ia64/Kconfig	Thu Jan  9 19:15:56 2003
+++ 02-pools/arch/ia64/Kconfig	Sat Jan 18 12:00:08 2003
@@ -246,6 +246,11 @@ config DISCONTIGMEM
 	  or have huge holes in the physical address space for other reasons.
 	  See <file:Documentation/vm/numa> for more.
 
+config SCHED_NUMA
+	bool "NUMA aware scheduler"
+	depends on NUMA
+	default y
+
 config VIRTUAL_MEM_MAP
 	bool "Enable Virtual Mem Map"
 	depends on !NUMA
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched.h 02-pools/include/linux/sched.h
--- 01-ingo/include/linux/sched.h	Fri Jan 17 09:18:32 2003
+++ 02-pools/include/linux/sched.h	Sat Jan 18 12:21:09 2003
@@ -447,12 +447,12 @@ extern void set_cpus_allowed(task_t *p, 
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
 extern void sched_balance_exec(void);
-extern void node_nr_running_init(void);
+extern void pool_nr_running_init(void);
 #else
 #define sched_balance_exec()   {}
-#define node_nr_running_init() {}
+#define pool_nr_running_init() {}
 #endif
 
 extern void set_user_nice(task_t *p, long nice);
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topo_ht.h 02-pools/include/linux/sched_topo_ht.h
--- 01-ingo/include/linux/sched_topo_ht.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topo_ht.h	Sat Jan 18 12:20:00 2003
@@ -0,0 +1,17 @@
+#define CONFIG_SCHED_POOLS 1               /* should be a real config option */
+
+/* 
+ * The following is a temporary hack, for which I make no apologies - mbligh
+ * Assumes CPUs are paired together siblings (0,1) (2,3) (4,5) .... etc.
+ * We should probably do this in an arch topo file and use apicids.
+ */
+
+#define MAX_NUMPOOLS NR_CPUS
+#define numpools (num_online_cpus() / 2)	/* two sibling CPUs per pool */
+
+#define pool_to_cpu_mask(pool)	((1UL << ((pool)*2)) | (1UL << ((pool)*2+1)))	/* bitwise OR: both siblings */
+#define cpu_to_pool(cpu)	((cpu) / 2)
+#define cpu_to_pool_mask(cpu)	(pool_to_cpu_mask(cpu_to_pool(cpu)))
+
+#define IDLE_REBALANCE_RATIO 2
+#define BUSY_REBALANCE_RATIO 2
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topo_numa.h 02-pools/include/linux/sched_topo_numa.h
--- 01-ingo/include/linux/sched_topo_numa.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topo_numa.h	Sat Jan 18 12:20:05 2003
@@ -0,0 +1,11 @@
+#define CONFIG_SCHED_POOLS 1               /* should be a real config option */
+
+#define MAX_NUMPOOLS MAX_NUMNODES
+#define numpools numnodes
+
+#define pool_to_cpu_mask	__node_to_cpu_mask
+#define cpu_to_pool		__cpu_to_node
+#define cpu_to_pool_mask(cpu)	(__node_to_cpu_mask(__cpu_to_node(cpu)))
+
+#define IDLE_REBALANCE_RATIO 10
+#define BUSY_REBALANCE_RATIO 5
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/include/linux/sched_topology.h 02-pools/include/linux/sched_topology.h
--- 01-ingo/include/linux/sched_topology.h	Wed Dec 31 16:00:00 1969
+++ 02-pools/include/linux/sched_topology.h	Sat Jan 18 11:59:36 2003
@@ -0,0 +1,14 @@
+#ifndef _LINUX_SCHED_TOPOLOGY_H
+#define _LINUX_SCHED_TOPOLOGY_H
+
+#ifdef CONFIG_SCHED_TOPO_ARCH
+#include <asm/sched_topo.h>
+#elif CONFIG_SCHED_NUMA
+#include <linux/sched_topo_numa.h>
+#elif CONFIG_SCHED_TOPO_HT
+#include <linux/sched_topo_ht.h>
+#else
+#include <linux/sched_topo_flat.h>
+#endif
+
+#endif /* _LINUX_SCHED_TOPOLOGY_H */
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/init/main.c 02-pools/init/main.c
--- 01-ingo/init/main.c	Fri Jan 17 09:18:32 2003
+++ 02-pools/init/main.c	Sat Jan 18 11:48:10 2003
@@ -495,7 +495,7 @@ static void do_pre_smp_initcalls(void)
 
 	migration_init();
 #endif
-	node_nr_running_init();
+	pool_nr_running_init();
 	spawn_ksoftirqd();
 }
 
diff -urpN -X /home/fletch/.diff.exclude 01-ingo/kernel/sched.c 02-pools/kernel/sched.c
--- 01-ingo/kernel/sched.c	Sat Jan 18 10:58:57 2003
+++ 02-pools/kernel/sched.c	Sat Jan 18 11:49:00 2003
@@ -32,6 +32,7 @@
 #include <linux/delay.h>
 #include <linux/timer.h>
 #include <linux/rcupdate.h>
+#include <linux/sched_topology.h>
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
@@ -67,7 +68,7 @@
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
 #define STARVATION_LIMIT	(2*HZ)
-#define NODE_THRESHOLD          125
+#define POOL_THRESHOLD          125
 
 /*
  * If a task is 'interactive' then we reinsert it in the active
@@ -154,9 +155,9 @@ struct runqueue {
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
-#ifdef CONFIG_NUMA
-	atomic_t *node_nr_running;
-	int prev_node_load[MAX_NUMNODES];
+#ifdef CONFIG_SCHED_POOLS
+	atomic_t *pool_nr_running;
+	int prev_pool_load[MAX_NUMPOOLS];
 #endif
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -181,47 +182,47 @@ static struct runqueue runqueues[NR_CPUS
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_POOLS
 
 /*
  * Keep track of running tasks.
  */
 
-static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
-	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+static atomic_t pool_nr_running[MAX_NUMPOOLS] ____cacheline_maxaligned_in_smp =
+	{[0 ...MAX_NUMPOOLS-1] = ATOMIC_INIT(0)};
 
 static inline void nr_running_init(struct runqueue *rq)
 {
-	rq->node_nr_running = &node_nr_running[0];
+	rq->pool_nr_running = &pool_nr_running[0];
 }
 
 static inline void nr_running_inc(runqueue_t *rq)
 {
-	atomic_inc(rq->node_nr_running);
+	atomic_inc(rq->pool_nr_running);
 	rq->nr_running++;
 }
 
 static inline void nr_running_dec(runqueue_t *rq)
 {
-	atomic_dec(rq->node_nr_running);
+	atomic_dec(rq->pool_nr_running);
 	rq->nr_running--;
 }
 
-__init void node_nr_running_init(void)
+__init void pool_nr_running_init(void)
 {
 	int i;
 
 	for (i = 0; i < NR_CPUS; i++)
-		cpu_rq(i)->node_nr_running = &node_nr_running[__cpu_to_node(i)];
+		cpu_rq(i)->pool_nr_running = &pool_nr_running[cpu_to_pool(i)];
 }
 
-#else /* !CONFIG_NUMA */
+#else /* !CONFIG_SCHED_POOLS */
 
 # define nr_running_init(rq)   do { } while (0)
 # define nr_running_inc(rq)    do { (rq)->nr_running++; } while (0)
 # define nr_running_dec(rq)    do { (rq)->nr_running--; } while (0)
 
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_POOLS */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -670,7 +671,7 @@ static inline void double_rq_unlock(runq
 		spin_unlock(&rq2->lock);
 }
 
-#if CONFIG_NUMA
+#if CONFIG_SCHED_POOLS
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -697,7 +698,7 @@ static void sched_migrate_task(task_t *p
  */
 static int sched_best_cpu(struct task_struct *p)
 {
-	int i, minload, load, best_cpu, node = 0;
+	int i, minload, load, best_cpu, pool = 0;
 	unsigned long cpumask;
 
 	best_cpu = task_cpu(p);
@@ -705,16 +706,16 @@ static int sched_best_cpu(struct task_st
 		return best_cpu;
 
 	minload = 10000000;
-	for (i = 0; i < numnodes; i++) {
-		load = atomic_read(&node_nr_running[i]);
+	for (i = 0; i < numpools; i++) {
+		load = atomic_read(&pool_nr_running[i]);
 		if (load < minload) {
 			minload = load;
-			node = i;
+			pool = i;
 		}
 	}
 
 	minload = 10000000;
-	cpumask = __node_to_cpu_mask(node);
+	cpumask = pool_to_cpu_mask(pool);
 	for (i = 0; i < NR_CPUS; ++i) {
 		if (!(cpumask & (1UL << i)))
 			continue;
@@ -730,7 +731,7 @@ void sched_balance_exec(void)
 {
 	int new_cpu;
 
-	if (numnodes > 1) {
+	if (numpools > 1) {
 		new_cpu = sched_best_cpu(current);
 		if (new_cpu != smp_processor_id())
 			sched_migrate_task(current, new_cpu);
@@ -738,33 +739,33 @@ void sched_balance_exec(void)
 }
 
 /*
- * Find the busiest node. All previous node loads contribute with a 
+ * Find the busiest pool. All previous pool loads contribute with a 
  * geometrically deccaying weight to the load measure:
- *      load_{t} = load_{t-1}/2 + nr_node_running_{t}
+ *      load_{t} = load_{t-1}/2 + nr_pool_running_{t}
  * This way sudden load peaks are flattened out a bit.
  */
-static int find_busiest_node(int this_node)
+static int find_busiest_pool(int this_pool)
 {
-	int i, node = -1, load, this_load, maxload;
+	int i, pool = -1, load, this_load, maxload;
 	
-	this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
-		+ atomic_read(&node_nr_running[this_node]);
-	this_rq()->prev_node_load[this_node] = this_load;
-	for (i = 0; i < numnodes; i++) {
-		if (i == this_node)
+	this_load = maxload = (this_rq()->prev_pool_load[this_pool] >> 1)
+		+ atomic_read(&pool_nr_running[this_pool]);
+	this_rq()->prev_pool_load[this_pool] = this_load;
+	for (i = 0; i < numpools; i++) {
+		if (i == this_pool)
 			continue;
-		load = (this_rq()->prev_node_load[i] >> 1)
-			+ atomic_read(&node_nr_running[i]);
-		this_rq()->prev_node_load[i] = load;
-		if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
+		load = (this_rq()->prev_pool_load[i] >> 1)
+			+ atomic_read(&pool_nr_running[i]);
+		this_rq()->prev_pool_load[i] = load;
+		if (load > maxload && (100*load > POOL_THRESHOLD*this_load)) {
 			maxload = load;
-			node = i;
+			pool = i;
 		}
 	}
-	return node;
+	return pool;
 }
 
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_POOLS */
 
 #if CONFIG_SMP
 
@@ -983,22 +984,20 @@ out:
  *
  * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
  * systems with HZ=100, every 10 msecs.)
- *
- * On NUMA, do a node-rebalance every 400 msecs.
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 2)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
+#define IDLE_POOL_REBALANCE_TICK (IDLE_REBALANCE_TICK * IDLE_REBALANCE_RATIO)
+#define BUSY_POOL_REBALANCE_TICK (BUSY_REBALANCE_TICK * BUSY_REBALANCE_RATIO)
 
-#if CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
+#if CONFIG_SCHED_POOLS
+static void balance_pool(runqueue_t *this_rq, int idle, int this_cpu)
 {
-	int node = find_busiest_node(__cpu_to_node(this_cpu));
+	int pool = find_busiest_pool(cpu_to_pool(this_cpu));
 	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
 
-	if (node >= 0) {
-		cpumask = __node_to_cpu_mask(node) | this_cpumask;
+	if (pool >= 0) {
+		cpumask = pool_to_cpu_mask(pool) | this_cpumask;
 		spin_lock(&this_rq->lock);
 		load_balance(this_rq, idle, cpumask);
 		spin_unlock(&this_rq->lock);
@@ -1008,38 +1007,38 @@ static void balance_node(runqueue_t *thi
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#if CONFIG_NUMA
+#if CONFIG_SCHED_POOLS
 	int this_cpu = smp_processor_id();
 #endif
 	unsigned long j = jiffies;
 
 	/*
-	 * First do inter-node rebalancing, then intra-node rebalancing,
-	 * if both events happen in the same tick. The inter-node
+	 * First do inter-pool rebalancing, then intra-pool rebalancing,
+	 * if both events happen in the same tick. The inter-pool
 	 * rebalancing does not necessarily have to create a perfect
-	 * balance within the node, since we load-balance the most loaded
-	 * node with the current CPU. (ie. other CPUs in the local node
+	 * balance within the pool, since we load-balance the most loaded
+	 * pool with the current CPU. (ie. other CPUs in the local pool
 	 * are not balanced.)
 	 */
 	if (idle) {
-#if CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
+#if CONFIG_SCHED_POOLS
+		if (!(j % IDLE_POOL_REBALANCE_TICK))
+			balance_pool(this_rq, idle, this_cpu);
 #endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, 0, __cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, 0, cpu_to_pool_mask(this_cpu));
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#if CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
+#if CONFIG_SCHED_POOLS
+	if (!(j % BUSY_POOL_REBALANCE_TICK))
+		balance_pool(this_rq, idle, this_cpu);
 #endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, __cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle, cpu_to_pool_mask(this_cpu));
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1208,7 +1207,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #if CONFIG_SMP
-		load_balance(rq, 1, __cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1, cpu_to_pool_mask(smp_processor_id()));
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

  reply	other threads:[~2003-01-18 20:46 UTC|newest]

Thread overview: 96+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-01-09 23:54 Minature NUMA scheduler Martin J. Bligh
2003-01-10  5:36 ` [Lse-tech] " Michael Hohnbaum
2003-01-10 16:34   ` Erich Focht
2003-01-10 16:57     ` Martin J. Bligh
2003-01-12 23:35       ` Erich Focht
2003-01-12 23:55       ` NUMA scheduler 2nd approach Erich Focht
2003-01-13  8:02         ` Christoph Hellwig
2003-01-13 11:32           ` Erich Focht
2003-01-13 15:26             ` [Lse-tech] " Christoph Hellwig
2003-01-13 15:46               ` Erich Focht
2003-01-13 19:03             ` Michael Hohnbaum
2003-01-14  1:23         ` Michael Hohnbaum
2003-01-14  4:45           ` [Lse-tech] " Andrew Theurer
2003-01-14  4:56             ` Martin J. Bligh
2003-01-14 11:14               ` Erich Focht
2003-01-14 15:55                 ` [PATCH 2.5.58] new NUMA scheduler Erich Focht
2003-01-14 16:07                   ` [Lse-tech] " Christoph Hellwig
2003-01-14 16:23                   ` [PATCH 2.5.58] new NUMA scheduler: fix Erich Focht
2003-01-14 16:43                     ` Erich Focht
2003-01-14 19:02                       ` Michael Hohnbaum
2003-01-14 21:56                         ` [Lse-tech] " Michael Hohnbaum
2003-01-15 15:10                         ` Erich Focht
2003-01-16  0:14                           ` Michael Hohnbaum
2003-01-16  6:05                           ` Martin J. Bligh
2003-01-16 16:47                             ` Erich Focht
2003-01-16 18:07                               ` Robert Love
2003-01-16 18:48                                 ` Martin J. Bligh
2003-01-16 19:07                                 ` Ingo Molnar
2003-01-16 18:59                                   ` Martin J. Bligh
2003-01-16 19:10                                   ` Christoph Hellwig
2003-01-16 19:44                                     ` Ingo Molnar
2003-01-16 19:43                                       ` Martin J. Bligh
2003-01-16 20:19                                         ` Ingo Molnar
2003-01-16 20:29                                           ` [Lse-tech] " Rick Lindsley
2003-01-16 23:31                                           ` Martin J. Bligh
2003-01-17  7:23                                             ` Ingo Molnar
2003-01-17  8:47                                             ` [patch] sched-2.5.59-A2 Ingo Molnar
2003-01-17 14:35                                               ` Erich Focht
2003-01-17 15:11                                                 ` Ingo Molnar
2003-01-17 15:30                                                   ` Erich Focht
2003-01-17 16:58                                                   ` Martin J. Bligh
2003-01-18 20:54                                                     ` Martin J. Bligh [this message]
2003-01-18 21:34                                                       ` [Lse-tech] NUMA sched -> pooling scheduler (inc HT) Martin J. Bligh
2003-01-19  0:13                                                         ` Andrew Theurer
2003-01-17 18:19                                                   ` [patch] sched-2.5.59-A2 Michael Hohnbaum
2003-01-18  7:08                                                   ` William Lee Irwin III
2003-01-18  8:12                                                     ` Martin J. Bligh
2003-01-18  8:16                                                       ` William Lee Irwin III
2003-01-19  4:22                                                     ` William Lee Irwin III
2003-01-17 17:21                                                 ` Martin J. Bligh
2003-01-17 17:23                                                 ` Martin J. Bligh
2003-01-17 18:11                                                 ` Erich Focht
2003-01-17 19:04                                                   ` Martin J. Bligh
2003-01-17 19:26                                                     ` [Lse-tech] " Martin J. Bligh
2003-01-18  0:13                                                       ` Michael Hohnbaum
2003-01-18 13:31                                                         ` [patch] tunable rebalance rates for sched-2.5.59-B0 Erich Focht
2003-01-18 23:09                                                         ` [patch] sched-2.5.59-A2 Erich Focht
2003-01-20  9:28                                                           ` Ingo Molnar
2003-01-20 12:07                                                             ` Erich Focht
2003-01-20 16:56                                                               ` Ingo Molnar
2003-01-20 17:04                                                                 ` Ingo Molnar
2003-01-20 17:10                                                                   ` Martin J. Bligh
2003-01-20 17:24                                                                     ` Ingo Molnar
2003-01-20 19:13                                                                       ` Andrew Theurer
2003-01-20 19:33                                                                         ` Martin J. Bligh
2003-01-20 19:52                                                                           ` Andrew Theurer
2003-01-20 19:52                                                                             ` Martin J. Bligh
2003-01-20 21:18                                                                               ` [patch] HT scheduler, sched-2.5.59-D7 Ingo Molnar
2003-01-20 22:28                                                                                 ` Andrew Morton
2003-01-21  1:11                                                                                   ` Michael Hohnbaum
2003-01-22  3:15                                                                                 ` Michael Hohnbaum
2003-01-22 16:41                                                                                   ` Andrew Theurer
2003-01-22 16:17                                                                                     ` Martin J. Bligh
2003-01-22 16:20                                                                                       ` Andrew Theurer
2003-01-22 16:35                                                                                     ` Michael Hohnbaum
2003-02-03 18:23                                                                                 ` [patch] HT scheduler, sched-2.5.59-E2 Ingo Molnar
2003-02-03 20:47                                                                                   ` Robert Love
2003-02-04  9:31                                                                                   ` Erich Focht
2003-01-20 17:04                                                                 ` [patch] sched-2.5.59-A2 Martin J. Bligh
2003-01-21 17:44                                                                 ` Erich Focht
2003-01-20 16:23                                                             ` Martin J. Bligh
2003-01-20 16:59                                                               ` Ingo Molnar
2003-01-17 23:09                                                     ` Matthew Dobson
2003-01-16 23:45                                           ` [PATCH 2.5.58] new NUMA scheduler: fix Michael Hohnbaum
2003-01-17 11:10                                           ` Erich Focht
2003-01-17 14:07                                             ` Ingo Molnar
2003-01-16 19:44                                       ` John Bradford
2003-01-14 16:51                     ` Christoph Hellwig
2003-01-15  0:05                     ` Michael Hohnbaum
2003-01-15  7:47                     ` Martin J. Bligh
2003-01-14  5:50             ` [Lse-tech] Re: NUMA scheduler 2nd approach Michael Hohnbaum
2003-01-14 16:52               ` Andrew Theurer
2003-01-14 15:13                 ` Erich Focht
2003-01-14 10:56           ` Erich Focht
2003-01-11 14:43     ` [Lse-tech] Minature NUMA scheduler Bill Davidsen
2003-01-12 23:24       ` Erich Focht

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=550960000.1042923260@titus \
    --to=mbligh@aracnet.com \
    --cc=efocht@ess.nec.de \
    --cc=habanero@us.ibm.com \
    --cc=hch@infradead.org \
    --cc=hohnbaum@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    --cc=mingo@elte.hu \
    --cc=rml@tech9.net \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.