From: Erich Focht <efocht@ess.nec.de>
To: "Martin J. Bligh" <mbligh@aracnet.com>,
Michael Hohnbaum <hohnbaum@us.ibm.com>
Cc: Robert Love <rml@tech9.net>, Anton Blanchard <anton@samba.org>,
Ingo Molnar <mingo@elte.hu>,
Stephen Hemminger <shemminger@osdl.org>,
linux-kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH 2.5.48] NUMA scheduler (2/2)
Date: Tue, 19 Nov 2002 17:27:23 +0100 [thread overview]
Message-ID: <200211191727.23308.efocht@ess.nec.de> (raw)
In-Reply-To: <200211191726.07214.efocht@ess.nec.de>
[-- Attachment #1: Type: text/plain, Size: 850 bytes --]
And here is the second patch...
Erich
On Tuesday 19 November 2002 17:26, Erich Focht wrote:
> As requested in another email, I attach the 2.5.48 patches for the
> NUMA scheduler which emerged from Michael's and my work:
>
> 01-numa-sched-core-2.5.48-22.patch: core NUMA scheduler infrastructure
> providing a node aware load_balancer. Rediffed and fixed one
> external declaration.
>
> 02-numa-sched-ilb-2.5.48-21.patch: initial load balancing, selects
> least loaded node & CPU at exec(). Rediffed.
>
> We are curious about any benchmark results and, of course, about running
> on other platforms than NUMAQ & Azusa/TX7, too.
>
> Regards,
> Erich
>
> On Monday 18 November 2002 20:40, Martin J. Bligh wrote:
> > BTW, can you keep producing normal patches too, when you do an update?
> > I don't use bitkeeper ...
[-- Attachment #2: 02-numa-sched-ilb-2.5.48-21.patch --]
[-- Type: text/x-diff, Size: 4660 bytes --]
diff -Nru a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c Tue Nov 19 17:09:02 2002
+++ b/fs/exec.c Tue Nov 19 17:09:02 2002
@@ -1023,6 +1023,8 @@
int retval;
int i;
+ sched_balance_exec();
+
file = open_exec(filename);
retval = PTR_ERR(file);
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h Tue Nov 19 17:09:02 2002
+++ b/include/linux/sched.h Tue Nov 19 17:09:02 2002
@@ -159,7 +159,19 @@
unsigned long system, int cpu);
extern void scheduler_tick(int user_tick, int system);
extern unsigned long cache_decay_ticks;
-
+#ifdef CONFIG_NUMA
+extern void sched_balance_exec(void);
+extern void node_nr_running_init(void);
+/*
+ * Adjust a runqueue's nr_running together with the atomic counter that
+ * rq->node_ptr points to.  Each macro is wrapped in do { } while (0) so
+ * that it expands to a single statement and cannot be split apart by an
+ * unbraced if/else at the call site.
+ */
+#define nr_running_inc(rq) do { atomic_inc((rq)->node_ptr); \
+	(rq)->nr_running++; } while (0)
+#define nr_running_dec(rq) do { atomic_dec((rq)->node_ptr); \
+	(rq)->nr_running--; } while (0)
+#else
+/* !CONFIG_NUMA: the hooks are no-ops and only rq->nr_running is counted. */
+#define sched_balance_exec() do { } while (0)
+#define node_nr_running_init() do { } while (0)
+#define nr_running_inc(rq) ((rq)->nr_running++)
+#define nr_running_dec(rq) ((rq)->nr_running--)
+#endif
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern signed long FASTCALL(schedule_timeout(signed long timeout));
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c Tue Nov 19 17:09:02 2002
+++ b/init/main.c Tue Nov 19 17:09:02 2002
@@ -500,6 +500,7 @@
migration_init();
#endif
+ node_nr_running_init();
spawn_ksoftirqd();
}
diff -Nru a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c Tue Nov 19 17:09:02 2002
+++ b/kernel/sched.c Tue Nov 19 17:09:02 2002
@@ -153,6 +153,7 @@
task_t *curr, *idle;
prio_array_t *active, *expired, arrays[2];
int prev_nr_running[NR_CPUS];
+ atomic_t * node_ptr;
task_t *migration_thread;
struct list_head migration_queue;
@@ -346,7 +347,7 @@
p->prio = effective_prio(p);
}
enqueue_task(p, array);
- rq->nr_running++;
+ nr_running_inc(rq);
}
/*
@@ -354,7 +355,7 @@
*/
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
- rq->nr_running--;
+ nr_running_dec(rq);
if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
dequeue_task(p, p->array);
@@ -841,9 +842,9 @@
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
{
dequeue_task(p, src_array);
- src_rq->nr_running--;
+ nr_running_dec(src_rq);
set_task_cpu(p, this_cpu);
- this_rq->nr_running++;
+ nr_running_inc(this_rq);
enqueue_task(p, this_rq->active);
/*
* Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2253,6 +2254,83 @@
#endif
+#if CONFIG_NUMA
+static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+/*
+ * For every CPU slot up to NR_CPUS, point that runqueue's node_ptr at
+ * the node_nr_running[] entry selected by __cpu_to_node().
+ */
+__init void node_nr_running_init(void)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpu_rq(cpu)->node_ptr =
+			&node_nr_running[__cpu_to_node(cpu)];
+}
+
+/*
+ * Move task p to dest_cpu, provided p's cpus_allowed mask permits it;
+ * otherwise do nothing.  The move is done by temporarily narrowing
+ * cpus_allowed to dest_cpu alone via set_cpus_allowed() and then
+ * putting the original mask back.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	unsigned long saved_mask = p->cpus_allowed;
+	unsigned long dest_mask = 1UL << dest_cpu;
+
+	if ((saved_mask & dest_mask) == 0)
+		return;
+
+	/* Restrict the task to dest_cpu only ... */
+	set_cpus_allowed(p, dest_mask);
+	/* ... then reinstate the mask it had on entry. */
+	set_cpus_allowed(p, saved_mask);
+}
+
+/*
+ * Choose a CPU for p.  p's current CPU is kept while its runqueue holds
+ * at most two tasks.  Otherwise pick the node with the smallest
+ * node_nr_running count and, within that node, the online CPU with the
+ * shortest runqueue.  If the node yields no online CPU, the current CPU
+ * is returned.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int cpu, nid, load, lowest, target_node = 0;
+	int chosen = task_cpu(p);
+
+	/* Current runqueue is short enough: stay put. */
+	if (cpu_rq(chosen)->nr_running <= 2)
+		return chosen;
+
+	/* Pass 1: node with the fewest running tasks. */
+	lowest = 10000000;
+	for (nid = 0; nid < numnodes; nid++) {
+		load = atomic_read(&node_nr_running[nid]);
+		if (load >= lowest)
+			continue;
+		lowest = load;
+		target_node = nid;
+	}
+
+	/* Pass 2: least loaded online CPU inside that node. */
+	lowest = 10000000;
+	loop_over_node(cpu, target_node) {
+		if (!cpu_online(cpu) || cpu_rq(cpu)->nr_running >= lowest)
+			continue;
+		chosen = cpu;
+		lowest = cpu_rq(chosen)->nr_running;
+	}
+	return chosen;
+}
+
+/*
+ * When there is more than one node, ask sched_best_cpu() for a CPU for
+ * the current task and migrate there if it differs from the CPU we are
+ * running on.  Does nothing on single-node systems.
+ */
+void sched_balance_exec(void)
+{
+	int target;
+
+	if (numnodes <= 1)
+		return;
+
+	target = sched_best_cpu(current);
+	if (target != smp_processor_id())
+		sched_migrate_task(current, target);
+}
+#endif /* CONFIG_NUMA */
+
#if CONFIG_SMP || CONFIG_PREEMPT
/*
* The 'big kernel lock'
@@ -2314,6 +2392,10 @@
spin_lock_init(&rq->lock);
INIT_LIST_HEAD(&rq->migration_queue);
atomic_set(&rq->nr_iowait, 0);
+#if CONFIG_NUMA
+ rq->node_ptr = &node_nr_running[0];
+#endif /* CONFIG_NUMA */
+
for (j = 0; j < 2; j++) {
array = rq->arrays + j;
next prev parent reply other threads:[~2002-11-19 16:21 UTC|newest]
Thread overview: 29+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-11-06 16:34 NUMA scheduler BK tree Erich Focht
2002-11-06 18:10 ` Michael Hohnbaum
2002-11-07 23:05 ` Erich Focht
2002-11-07 23:46 ` Michael Hohnbaum
2002-11-08 16:57 ` Erich Focht
2002-11-11 15:13 ` [PATCH 2.5.47] NUMA scheduler (1/2) Erich Focht
2002-11-11 15:14 ` [PATCH 2.5.47] NUMA scheduler (2/2) Erich Focht
2002-11-12 0:24 ` [PATCH 2.5.47] NUMA scheduler (1/2) Michael Hohnbaum
2002-11-18 19:40 ` NUMA scheduler BK tree Martin J. Bligh
2002-11-19 16:26 ` [PATCH 2.5.48] NUMA scheduler (1/2) Erich Focht
2002-11-19 16:27 ` Erich Focht [this message]
2002-12-02 15:29 ` [PATCH 2.5.50] " Erich Focht
2002-12-02 15:30 ` [PATCH 2.5.50] NUMA scheduler (2/2) Erich Focht
2002-12-06 17:39 ` [PATCH 2.5.50] NUMA scheduler (1/2) Michael Hohnbaum
2002-12-18 16:21 ` [PATCH 2.5.52] " Erich Focht
2002-12-18 16:23 ` [PATCH 2.5.52] NUMA scheduler (2/2) Erich Focht
2002-12-20 14:49 ` [PATCH 2.5.52] NUMA scheduler: cputimes stats Erich Focht
2002-12-20 15:17 ` [PATCH 2.5.52] NUMA scheduler (1/2) Christoph Hellwig
2002-12-20 17:44 ` Erich Focht
2002-12-31 13:29 ` [PATCH 2.5.53] NUMA scheduler (1/3) Erich Focht
2002-12-31 13:29 ` [PATCH 2.5.53] NUMA scheduler (2/3) Erich Focht
2002-12-31 13:30 ` [PATCH 2.5.53] NUMA scheduler (3/3) Erich Focht
2003-01-04 1:58 ` [PATCH 2.5.53] NUMA scheduler (1/3) Michael Hohnbaum
2003-01-05 5:35 ` Martin J. Bligh
2003-01-06 3:58 ` Michael Hohnbaum
2003-01-06 6:07 ` Martin J. Bligh
2003-01-07 2:23 ` Michael Hohnbaum
2003-01-07 11:27 ` Erich Focht
2003-01-07 23:35 ` Michael Hohnbaum
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200211191727.23308.efocht@ess.nec.de \
--to=efocht@ess.nec.de \
--cc=anton@samba.org \
--cc=hohnbaum@us.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mbligh@aracnet.com \
--cc=mingo@elte.hu \
--cc=rml@tech9.net \
--cc=shemminger@osdl.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.