All of lore.kernel.org
 help / color / mirror / Atom feed
From: Erich Focht <efocht@ess.nec.de>
To: "Martin J. Bligh" <mbligh@aracnet.com>,
	Michael Hohnbaum <hohnbaum@us.ibm.com>
Cc: Robert Love <rml@tech9.net>, Ingo Molnar <mingo@elte.hu>,
	Stephen Hemminger <shemminger@osdl.org>,
	linux-kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH 2.5.53] NUMA scheduler (2/3)
Date: Tue, 31 Dec 2002 14:29:55 +0100	[thread overview]
Message-ID: <200212311429.55142.efocht@ess.nec.de> (raw)
In-Reply-To: <200212311429.04382.efocht@ess.nec.de>

[-- Attachment #1: Type: text/plain, Size: 918 bytes --]

... the second patch ...

On Tuesday 31 December 2002 14:29, Erich Focht wrote:
> Here comes the minimal NUMA scheduler built on top of the O(1) load
> balancer rediffed for 2.5.53 with some changes in the core part. As
> suggested by Michael, I added the cputimes_stat patch, as it is
> absolutely needed for understanding the scheduler behavior.
>
> The three patches:
> 01-numa-sched-core-2.5.53-24.patch: core NUMA scheduler infrastructure
>   providing a node aware load_balancer. Cosmetic changes + more comments.
>
> 02-numa-sched-ilb-2.5.53-21.patch: initial load balancing, selects
>   least loaded node & CPU at exec().
>
> 03-cputimes_stat-2.5.53.patch: adds back to the kernel per CPU user
>   and system time statistics for each process in /proc/PID/cpu. Needed
>   for evaluating scheduler behavior and performance of tasks running
>   on SMP and NUMA systems.
>
> Regards,
> Erich

[-- Attachment #2: 02-numa-sched-ilb-2.5.53-21.patch --]
[-- Type: text/x-diff, Size: 4878 bytes --]

diff -urN a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c	2002-12-24 06:20:03.000000000 +0100
+++ b/fs/exec.c	2002-12-31 14:15:45.000000000 +0100
@@ -1024,6 +1024,8 @@
 	int retval;
 	int i;
 
+	sched_balance_exec();
+
 	file = open_exec(filename);
 
 	retval = PTR_ERR(file);
diff -urN a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	2002-12-31 13:02:18.000000000 +0100
+++ b/include/linux/sched.h	2002-12-31 14:17:13.000000000 +0100
@@ -160,7 +160,6 @@
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
 
-
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long FASTCALL(schedule_timeout(signed long timeout));
 asmlinkage void schedule(void);
@@ -447,8 +446,18 @@
 
 #ifdef CONFIG_NUMA
 extern void build_node_data(void);
+extern void sched_balance_exec(void);	/* invoked from the fs/exec.c hunk of this patch */
+extern void node_nr_running_init(void);	/* invoked from the init/main.c hunk of this patch */
+#define nr_running_inc(rq) atomic_inc(rq->node_ptr); \
+	rq->nr_running++	/* NOTE: expands to two statements; use only as a full statement */
+#define nr_running_dec(rq) atomic_dec(rq->node_ptr); \
+	rq->nr_running--	/* NOTE: likewise two statements, not wrapped in do { } while (0) */
 #else
 #define build_node_data() {}
+#define sched_balance_exec() {}
+#define node_nr_running_init() {}
+#define nr_running_inc(rq) rq->nr_running++	/* non-NUMA: only the per-runqueue count */
+#define nr_running_dec(rq) rq->nr_running--	/* non-NUMA: only the per-runqueue count */
 #endif
 
 extern void set_user_nice(task_t *p, long nice);
diff -urN a/init/main.c b/init/main.c
--- a/init/main.c	2002-12-24 06:19:46.000000000 +0100
+++ b/init/main.c	2002-12-31 14:15:45.000000000 +0100
@@ -489,6 +489,7 @@
 
 	migration_init();
 #endif
+	node_nr_running_init();
 	spawn_ksoftirqd();
 }
 
diff -urN a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	2002-12-31 13:46:03.000000000 +0100
+++ b/kernel/sched.c	2002-12-31 14:15:45.000000000 +0100
@@ -153,6 +153,7 @@
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
+	atomic_t * node_ptr;
 
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -356,7 +357,7 @@
 		p->prio = effective_prio(p);
 	}
 	enqueue_task(p, array);
-	rq->nr_running++;
+	nr_running_inc(rq);
 }
 
 /*
@@ -364,7 +365,7 @@
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	nr_running_dec(rq);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -851,9 +852,9 @@
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	nr_running_inc(this_rq);
 	enqueue_task(p, this_rq->active);
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2293,6 +2294,83 @@
 
 #endif
 
+#if CONFIG_NUMA
+static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+__init void node_nr_running_init(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {	/* all NR_CPUS slots; no cpu_online() test */
+		cpu_rq(i)->node_ptr = &node_nr_running[__cpu_to_node(i)];	/* rq -> its node's counter */
+	}
+	return;
+}
+
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpus_allowed mask to only
+ * allow dest_cpu, which will force the task onto dest_cpu.  Then
+ * the original cpus_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	unsigned long old_mask;
+
+	old_mask = p->cpus_allowed;
+	if (!(old_mask & (1UL << dest_cpu)))
+		return;	/* dest_cpu is not in the mask: leave the task alone */
+	/* force the process onto the specified CPU */
+	set_cpus_allowed(p, 1UL << dest_cpu);
+
+	/* restore the cpus allowed mask */
+	set_cpus_allowed(p, old_mask);
+}
+
+/*
+ * Pick a CPU for p: its current CPU if that runqueue holds <= 2 tasks, else
+ * the least loaded online CPU of the node with the fewest running tasks.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int i, minload, load, best_cpu, node = 0;
+
+	best_cpu = task_cpu(p);
+	if (cpu_rq(best_cpu)->nr_running <= 2)
+		return best_cpu;	/* current CPU is lightly loaded: stay */
+
+	minload = 10000000;	/* start high so the first node compared wins */
+	for (i = 0; i < numnodes; i++) {
+		load = atomic_read(&node_nr_running[i]);
+		if (load < minload) {
+			minload = load;
+			node = i;
+		}
+	}
+	minload = 10000000;	/* reset before the per-CPU search */
+	loop_over_node(i,node) {	/* presumably visits the CPUs of 'node'; macro not in this patch */
+		if (!cpu_online(i))
+			continue;
+		if (cpu_rq(i)->nr_running < minload) {
+			best_cpu = i;
+			minload = cpu_rq(i)->nr_running;
+		}
+	}
+	return best_cpu;	/* still task_cpu(p) if no online CPU was selected above */
+}
+
+void sched_balance_exec(void)
+{
+	int new_cpu;
+
+	if (numnodes > 1) {	/* nothing to balance across on a single node */
+		new_cpu = sched_best_cpu(current);
+		if (new_cpu != smp_processor_id())	/* migrate only if the choice differs from this CPU */
+			sched_migrate_task(current, new_cpu);
+	}
+}
+#endif /* CONFIG_NUMA */
+
 #if CONFIG_SMP || CONFIG_PREEMPT
 /*
  * The 'big kernel lock'
@@ -2354,6 +2432,10 @@
 		spin_lock_init(&rq->lock);
 		INIT_LIST_HEAD(&rq->migration_queue);
 		atomic_set(&rq->nr_iowait, 0);
+#if CONFIG_NUMA
+		rq->node_ptr = &node_nr_running[0];
+#endif /* CONFIG_NUMA */
+
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;

  reply	other threads:[~2002-12-31 13:21 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-11-06 16:34 NUMA scheduler BK tree Erich Focht
2002-11-06 18:10 ` Michael Hohnbaum
2002-11-07 23:05   ` Erich Focht
2002-11-07 23:46 ` Michael Hohnbaum
2002-11-08 16:57   ` Erich Focht
2002-11-11 15:13 ` [PATCH 2.5.47] NUMA scheduler (1/2) Erich Focht
2002-11-11 15:14   ` [PATCH 2.5.47] NUMA scheduler (2/2) Erich Focht
2002-11-12  0:24   ` [PATCH 2.5.47] NUMA scheduler (1/2) Michael Hohnbaum
2002-11-18 19:40 ` NUMA scheduler BK tree Martin J. Bligh
2002-11-19 16:26   ` [PATCH 2.5.48] NUMA scheduler (1/2) Erich Focht
2002-11-19 16:27     ` [PATCH 2.5.48] NUMA scheduler (2/2) Erich Focht
2002-12-02 15:29     ` [PATCH 2.5.50] NUMA scheduler (1/2) Erich Focht
2002-12-02 15:30       ` [PATCH 2.5.50] NUMA scheduler (2/2) Erich Focht
2002-12-06 17:39       ` [PATCH 2.5.50] NUMA scheduler (1/2) Michael Hohnbaum
2002-12-18 16:21       ` [PATCH 2.5.52] " Erich Focht
2002-12-18 16:23         ` [PATCH 2.5.52] NUMA scheduler (2/2) Erich Focht
2002-12-20 14:49         ` [PATCH 2.5.52] NUMA scheduler: cputimes stats Erich Focht
2002-12-20 15:17         ` [PATCH 2.5.52] NUMA scheduler (1/2) Christoph Hellwig
2002-12-20 17:44           ` Erich Focht
2002-12-31 13:29         ` [PATCH 2.5.53] NUMA scheduler (1/3) Erich Focht
2002-12-31 13:29           ` Erich Focht [this message]
2002-12-31 13:30           ` [PATCH 2.5.53] NUMA scheduler (3/3) Erich Focht
2003-01-04  1:58           ` [PATCH 2.5.53] NUMA scheduler (1/3) Michael Hohnbaum
2003-01-05  5:35             ` Martin J. Bligh
2003-01-06  3:58               ` Michael Hohnbaum
2003-01-06  6:07                 ` Martin J. Bligh
2003-01-07  2:23                   ` Michael Hohnbaum
2003-01-07 11:27                     ` Erich Focht
2003-01-07 23:35                       ` Michael Hohnbaum

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200212311429.55142.efocht@ess.nec.de \
    --to=efocht@ess.nec.de \
    --cc=hohnbaum@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mbligh@aracnet.com \
    --cc=mingo@elte.hu \
    --cc=rml@tech9.net \
    --cc=shemminger@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.