From: Erich Focht <efocht@ess.nec.de>
To: "Martin J. Bligh" <mbligh@aracnet.com>,
linux-kernel <linux-kernel@vger.kernel.org>
Cc: LSE <lse-tech@lists.sourceforge.net>, Ingo Molnar <mingo@elte.hu>,
Michael Hohnbaum <hohnbaum@us.ibm.com>
Subject: Re: [Lse-tech] [PATCH 1/2] node affine NUMA scheduler
Date: Sun, 22 Sep 2002 12:35:13 +0200 [thread overview]
Message-ID: <200209221235.13341.efocht@ess.nec.de> (raw)
In-Reply-To: <598631797.1032601564@[10.10.2.3]>
[-- Attachment #1: Type: text/plain, Size: 1608 bytes --]
On Saturday 21 September 2002 18:46, Martin J. Bligh wrote:
> > Hmmm .... well I ran the One True Benchmark (tm). The patch
> > *increased* my kernel compile time from about 20s to about 28s.
> > Not sure I like that idea ;-) Anything you'd like tweaked, or
> > more info? Both user and system time were up ... I'll grab a
> > profile of kernel stuff.
>
> From the below, I'd suggest you're getting pages off the wrong
> nodes: do_anonymous_page is page zeroing, and rmqueue the buddy
> allocator. Are you sure the current->node thing is getting set
> correctly? I'll try backing out your alloc_pages tweaking, and
> see what happens.
Could you please check in dmesg whether the CPU pools are initialised
correctly? Maybe something goes wrong for your platform.
The node_distance is most probably not optimal for NUMAQ and might
need some tuning. The default is set for a maximum of 8 nodes, with nodes
1-4 and 5-8 being in separate supernodes, and latency ratios of 1:1.5:2.
You could use the attached patch to get an idea of the load
distribution. It's a quick&dirty hack which creates the following files:
/proc/sched/load/rqNN : load of runqueue NN, including info on tasks
not running on their homenode
/proc/sched/history/ilbNN : history of the last 25 initial load balancing
decisions for runqueue NN
/proc/sched/history/lbNN : history of the last 25 load balancing decisions
on runqueue NN.
It should be possible to find the reason for the poor performance by
looking at the nr_homenode entries in /proc/sched/load/rqNN.
Thanks,
best regards,
Erich
[-- Attachment #2: proc_sched_hist_2.5.37.patch --]
[-- Type: text/x-diff, Size: 6470 bytes --]
diff -urNp 2.5.37-node-affine/kernel/sched.c 2.5.37-node-affine-mon/kernel/sched.c
--- 2.5.37-node-affine/kernel/sched.c Sun Sep 22 11:13:59 2002
+++ 2.5.37-node-affine-mon/kernel/sched.c Sun Sep 22 11:29:26 2002
@@ -677,6 +677,175 @@ static inline unsigned int double_lock_b
return nr_running;
}
+#define HISTORY_RING_SIZE 25 /* number of entries kept in each history ring */
+/* load balancing history entry, filled in by lb_ring_add() */
+struct lb_hist_entry {
+ unsigned long time; /* jiffies value when the entry was recorded */
+ int pid; /* stolen task (0 if none) */
+ int busiest_cpu; /* busiest RQ; printed as "from_cpu" by lb_ring_read_proc() */
+};
+/* load balancing history ring; slots are reused cyclically */
+struct lb_hist_ring {
+ int curr; /* index of the most recently written entry */
+ struct lb_hist_entry data[HISTORY_RING_SIZE];
+} ____cacheline_aligned;
+/* per CPU history ring array, indexed by CPU number */
+struct lb_hist_ring lb_ring[NR_CPUS];
+
+/* initial load balancing decision entry, filled in by ilb_ring_add() */
+struct ilb_hist_entry {
+ unsigned long time; /* jiffies value when the entry was recorded */
+ int pid; /* pid handed in by the caller of ilb_ring_add() */
+ int node; /* selected homenode */
+ int load[NR_NODES]; /* node loads at decision time; only the first numpools slots are written */
+};
+/* initial load balancing history ring; slots are reused cyclically */
+struct ilb_hist_ring {
+ int curr; /* index of the most recently written entry */
+ struct ilb_hist_entry data[HISTORY_RING_SIZE];
+} ____cacheline_aligned;
+/* per CPU history ring array, indexed by CPU number */
+struct ilb_hist_ring ilb_ring[NR_CPUS];
+
+/*
+ * Record one load balancing event in the history ring of @cpu.
+ *
+ * The slot following the current one is overwritten with the current
+ * jiffies value, @pid and @busiest_cpu, and then becomes the current slot.
+ */
+void lb_ring_add(int cpu, int pid, int busiest_cpu)
+{
+	struct lb_hist_ring *ring = &lb_ring[cpu];
+	int slot = (ring->curr + 1) % HISTORY_RING_SIZE;
+	struct lb_hist_entry *rec = &ring->data[slot];
+
+	rec->time = jiffies;
+	rec->pid = pid;
+	rec->busiest_cpu = busiest_cpu;
+	ring->curr = slot;
+}
+
+/*
+ * Record one initial load balancing decision in the history ring of @cpu.
+ *
+ * The slot following the current one is overwritten with the current
+ * jiffies value, @pid, the chosen @node and the first numpools values of
+ * @load, and then becomes the current slot.
+ */
+void ilb_ring_add(int cpu, int pid, int node, int *load)
+{
+	struct ilb_hist_ring *ring = &ilb_ring[cpu];
+	int slot = (ring->curr + 1) % HISTORY_RING_SIZE;
+	struct ilb_hist_entry *rec = &ring->data[slot];
+	int n;
+
+	rec->time = jiffies;
+	rec->pid = pid;
+	rec->node = node;
+	for (n = 0; n < numpools; n++)
+		rec->load[n] = load[n];
+	ring->curr = slot;
+}
+
+/*
+ * read_proc handler printing the load balancing history ring of one CPU.
+ *
+ * @page:  buffer the text is formatted into
+ * @start: set to the position in @page matching the requested offset
+ * @off:   offset into the file the reader asked for
+ * @count: maximum number of bytes the reader wants
+ * @eof:   set to 1 when the returned data reaches the end of the text
+ * @data:  CPU number, stored in the pointer value itself
+ *
+ * Prints a header line followed by all HISTORY_RING_SIZE entries, starting
+ * with the slot after the current one (the oldest) and ending with the
+ * current one. Returns the number of bytes available at *start.
+ */
+int lb_ring_read_proc(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{
+	int i, len, entry;
+	char *buff = page;
+	/* go through long so the pointer is not truncated by the cast */
+	int cpu = (int)(long)data;
+
+	buff += sprintf(buff," tick pid from_cpu\n");
+	entry = lb_ring[cpu].curr;
+	for (i = 0; i < HISTORY_RING_SIZE; i++) {
+		entry = (entry + 1) % HISTORY_RING_SIZE;
+		/* time is an unsigned long, so print it with %lu */
+		buff += sprintf(buff,"%12lu %6d %2d\n",
+				lb_ring[cpu].data[entry].time,
+				lb_ring[cpu].data[entry].pid,
+				lb_ring[cpu].data[entry].busiest_cpu);
+	}
+	len = buff - page;
+	if (len <= off + count)
+		*eof = 1;
+	/*
+	 * The return value below counts bytes from @off onwards, so tell
+	 * the caller where in the page those bytes begin.
+	 */
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+/*
+ * read_proc handler printing the initial load balancing history ring of
+ * one CPU.
+ *
+ * @page:  buffer the text is formatted into
+ * @start: set to the position in @page matching the requested offset
+ * @off:   offset into the file the reader asked for
+ * @count: maximum number of bytes the reader wants
+ * @eof:   set to 1 when the returned data reaches the end of the text
+ * @data:  CPU number, stored in the pointer value itself
+ *
+ * Prints a header line followed by all HISTORY_RING_SIZE entries, oldest
+ * first; each line carries the tick, pid, selected node and the first
+ * numpools recorded node loads. Returns the number of bytes available at
+ * *start.
+ */
+int ilb_ring_read_proc(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int i, j, len, entry;
+	char *buff = page;
+	/* go through long so the pointer is not truncated by the cast */
+	int cpu = (int)(long)data;
+
+	buff += sprintf(buff," tick pid node node_loads\n");
+	entry = ilb_ring[cpu].curr;
+	for (i = 0; i < HISTORY_RING_SIZE; i++) {
+		entry = (entry + 1) % HISTORY_RING_SIZE;
+		/* time is an unsigned long, so print it with %lu */
+		buff += sprintf(buff,"%12lu %6d %2d",
+				ilb_ring[cpu].data[entry].time,
+				ilb_ring[cpu].data[entry].pid,
+				ilb_ring[cpu].data[entry].node);
+		for (j = 0; j < numpools; j++)
+			buff += sprintf(buff," %3d",
+					ilb_ring[cpu].data[entry].load[j]);
+		buff += sprintf(buff,"\n");
+	}
+	len = buff - page;
+	if (len <= off + count)
+		*eof = 1;
+	/*
+	 * The return value below counts bytes from @off onwards, so tell
+	 * the caller where in the page those bytes begin.
+	 */
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+/*
+ * read_proc handler printing the load of one runqueue.
+ *
+ * @page:  buffer the text is formatted into
+ * @start: set to the position in @page matching the requested offset
+ * @off:   offset into the file the reader asked for
+ * @count: maximum number of bytes the reader wants
+ * @eof:   set to 1 when the returned data reaches the end of the text
+ * @data:  CPU number, stored in the pointer value itself
+ *
+ * Prints the CPU number, pid and comm of the runqueue's current task,
+ * then nr_running, nr_uninterruptible and the first numpools nr_homenode
+ * counters. Returns the number of bytes available at *start.
+ */
+int rq_load_read_proc(char *page, char **start, off_t off,
+		      int count, int *eof, void *data)
+{
+	int i, len;
+	runqueue_t *rq;
+	char *buff = page;
+	/* go through long so the pointer is not truncated by the cast */
+	int cpu = (int)(long)data;
+
+	rq = cpu_rq(cpu);
+	buff += sprintf(buff,"cpu %d : ",cpu);
+	buff += sprintf(buff,"curr: %d %s\n",rq->curr->pid,rq->curr->comm);
+	buff += sprintf(buff,"running uninter nr_homenode\n");
+	buff += sprintf(buff,"%7d %7d",rq->nr_running,rq->nr_uninterruptible);
+	for (i = 0; i < numpools; i++)
+		buff += sprintf(buff," %4d",rq->nr_homenode[i]);
+	buff += sprintf(buff,"\n");
+
+	len = buff - page;
+	if (len <= off + count)
+		*eof = 1;
+	/*
+	 * The return value below counts bytes from @off onwards, so tell
+	 * the caller where in the page those bytes begin.
+	 */
+	*start = page + off;
+	len -= off;
+	if (len > count)
+		len = count;
+	if (len < 0)
+		len = 0;
+	return len;
+}
+
+#include <linux/proc_fs.h>
+/*
+ * Create the /proc/sched tree:
+ *   sched/history/lbNN  - read through lb_ring_read_proc()
+ *   sched/history/ilbNN - read through ilb_ring_read_proc()
+ *   sched/load/rqNN     - read through rq_load_read_proc()
+ * with one set of files per CPU below smp_num_cpus. The CPU number is
+ * stored in each entry's data pointer, where the read handlers pick it up.
+ *
+ * Entries are skipped when their parent directory could not be created.
+ */
+void init_sched_proc(void)
+{
+	int i;
+	char name[12];
+	struct proc_dir_entry *p, *hist, *sched, *load;
+
+	sched = proc_mkdir("sched",&proc_root);
+	if (!sched)
+		return;
+
+	hist = proc_mkdir("history",sched);
+	for (i = 0; hist && i < smp_num_cpus; i++) {
+		sprintf(name,"lb%02d",i);
+		p = create_proc_entry(name,S_IRUGO,hist);
+		if (p) {
+			p->read_proc = lb_ring_read_proc;
+			/* data is a pointer: cast explicitly */
+			p->data = (void *)(long)i;
+		}
+		sprintf(name,"ilb%02d",i);
+		p = create_proc_entry(name,S_IRUGO,hist);
+		if (p) {
+			p->read_proc = ilb_ring_read_proc;
+			p->data = (void *)(long)i;
+		}
+	}
+
+	load = proc_mkdir("load",sched);
+	for (i = 0; load && i < smp_num_cpus; i++) {
+		sprintf(name,"rq%02d",i);
+		p = create_proc_entry(name,S_IRUGO,load);
+		if (p) {
+			p->read_proc = rq_load_read_proc;
+			p->data = (void *)(long)i;
+		}
+	}
+}
+
/*
* Calculate load of a CPU pool, store results in data[][NR_CPUS].
* Return the index of the most loaded runqueue.
@@ -961,6 +1130,7 @@ static void load_balance(runqueue_t *thi
tmp = task_to_steal(busiest, this_cpu);
if (!tmp)
goto out_unlock;
+ lb_ring_add(smp_processor_id(), tmp->pid, tmp->thread_info->cpu);
pull_task(busiest, tmp->array, tmp, this_rq, this_cpu);
out_unlock:
spin_unlock(&busiest->lock);
@@ -2051,7 +2221,7 @@ static int sched_best_cpu(struct task_st
*/
static int sched_best_node(struct task_struct *p, int flag)
{
- int n, best_node=0, min_load, pool_load, min_pool=p->node;
+ int n, best_node=0, min_load, min_pool=p->node;
int pool, load[NR_NODES];
unsigned long mask = p->cpus_allowed & cpu_online_map;
@@ -2079,13 +2249,14 @@ static int sched_best_node(struct task_s
min_load = 100000000;
for (n = 0; n < numpools; n++) {
pool = (best_node + n) % numpools;
- pool_load = (100*load[pool])/pool_nr_cpus[pool];
- if ((pool_load < min_load) && (pool_mask[pool] & mask)) {
- min_load = pool_load;
+ load[pool] = (100*load[pool])/pool_nr_cpus[pool];
+ if ((load[pool] < min_load) && (pool_mask[pool] & mask)) {
+ min_load = load[pool];
min_pool = pool;
}
}
atomic_set(&sched_node, min_pool);
+ ilb_ring_add(smp_processor_id(), p->pid, min_pool, load);
return min_pool;
}
@@ -2282,6 +2453,7 @@ void bld_pools(void)
find_node_levels(numpools);
init_pool_weight();
init_pool_delay();
+ init_sched_proc();
}
void set_task_node(task_t *p, int node)
next prev parent reply other threads:[~2002-09-22 10:31 UTC|newest]
Thread overview: 27+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-09-21 9:59 [PATCH 1/2] node affine NUMA scheduler Erich Focht
2002-09-21 10:02 ` [PATCH 2/2] " Erich Focht
2002-09-21 15:55 ` [Lse-tech] [PATCH 1/2] " Martin J. Bligh
2002-09-21 16:32 ` Martin J. Bligh
2002-09-21 16:46 ` Martin J. Bligh
2002-09-21 17:11 ` Martin J. Bligh
2002-09-21 17:32 ` Erich Focht
2002-09-21 17:38 ` William Lee Irwin III
2002-09-21 23:18 ` William Lee Irwin III
2002-09-22 8:09 ` William Lee Irwin III
2002-09-22 8:30 ` Erich Focht
2002-09-22 17:11 ` Martin J. Bligh
2002-09-22 19:20 ` Martin J. Bligh
2002-09-22 21:59 ` Erich Focht
2002-09-22 22:36 ` William Lee Irwin III
2002-09-22 22:51 ` Martin J. Bligh
2002-09-23 18:19 ` node affine NUMA scheduler: simple benchmark Erich Focht
2002-09-22 10:35 ` Erich Focht [this message]
2002-09-22 10:45 ` [Lse-tech] [PATCH 1/2] node affine NUMA scheduler Erich Focht
2002-09-22 14:57 ` Martin J. Bligh
2002-09-23 18:38 ` Erich Focht
2002-09-23 18:47 ` Martin J. Bligh
2002-09-24 21:04 ` Erich Focht
2002-09-24 21:17 ` Martin J. Bligh
2002-09-22 15:52 ` Martin J. Bligh
2002-09-22 19:24 ` Martin J. Bligh
2002-09-24 23:59 ` Matthew Dobson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200209221235.13341.efocht@ess.nec.de \
--to=efocht@ess.nec.de \
--cc=hohnbaum@us.ibm.com \
--cc=linux-kernel@vger.kernel.org \
--cc=lse-tech@lists.sourceforge.net \
--cc=mbligh@aracnet.com \
--cc=mingo@elte.hu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox