* [PATCH v2] sparc64: fix and optimize irq distribution
@ 2009-05-13 16:52 Hong H. Pham
2009-05-22 0:14 ` David Miller
` (3 more replies)
0 siblings, 4 replies; 5+ messages in thread
From: Hong H. Pham @ 2009-05-13 16:52 UTC (permalink / raw)
To: sparclinux
irq_choose_cpu() should compare the affinity mask against cpu_online_map
rather than CPU_MASK_ALL, since irq_select_affinity() sets the interrupt's
affinity mask to cpu_online_map "and" CPU_MASK_ALL (which ends up being
just cpu_online_map). The mask comparison in irq_choose_cpu() will always
fail since the two masks are not the same. So the CPU chosen is the first CPU
in the intersection of cpu_online_map and CPU_MASK_ALL, which is always CPU0.
That means all interrupts are reassigned to CPU0...
Distributing interrupts to CPUs in a linearly increasing round robin fashion
is not optimal for the UltraSPARC T1/T2. Also, the irq_rover in
irq_choose_cpu() causes an interrupt to be assigned to a different
processor each time the interrupt is allocated and released. This may lead
to an unbalanced distribution over time.
A static mapping of interrupts to processors is done to optimize and balance
interrupt distribution. For the T1/T2, interrupts are spread to different
cores first, and then to strands within a core.
The following are benchmarks showing the effects of interrupt distribution
on a T2. The test was done with iperf using a pair of T5220 boxes, each
with a 10GBe NIU (XAUI) connected back to back.
TCP | Stock Linear RR IRQ Optimized IRQ
Streams | 2.6.30-rc5 Distribution Distribution
| GBits/sec GBits/sec GBits/sec
--------+-----------------------------------------
1 0.839 0.862 0.868
8 1.16 4.96 5.88
16 1.15 6.40 8.04
100 1.09 7.28 8.68
Signed-off-by: Hong H. Pham <hong.pham@windriver.com>
---
arch/sparc/kernel/Makefile | 1 +
arch/sparc/kernel/cpumap.c | 110 ++++++++++++++++++++++++++++++++++++++++++++
arch/sparc/kernel/cpumap.h | 15 ++++++
arch/sparc/kernel/irq_64.c | 29 ++----------
arch/sparc/kernel/smp_64.c | 2 +
5 files changed, 132 insertions(+), 25 deletions(-)
create mode 100644 arch/sparc/kernel/cpumap.c
create mode 100644 arch/sparc/kernel/cpumap.h
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 54742e5..47029c6 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -53,8 +53,9 @@ obj-$(CONFIG_SPARC64) += hvapi.o
obj-$(CONFIG_SPARC64) += sstate.o
obj-$(CONFIG_SPARC64) += mdesc.o
obj-$(CONFIG_SPARC64) += pcr.o
obj-$(CONFIG_SPARC64) += nmi.o
+obj-$(CONFIG_SPARC64_SMP) += cpumap.o
# sparc32 do not use GENERIC_HARDIRQS but uses the generic devres implementation
obj-$(CONFIG_SPARC32) += devres.o
devres-y := ../../../kernel/irq/devres.o
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
new file mode 100644
index 0000000..0b1dce7
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.c
@@ -0,0 +1,110 @@
+/* cpumap.c: used for optimizing CPU assignment
+ *
+ * Copyright (C) 2009 Hong H. Pham <hong.pham@windriver.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include "cpumap.h"
+
+
+static u16 cpu_distribution_map[NR_CPUS];
+static int cpu_map_entries = 0;
+static DEFINE_SPINLOCK(cpu_map_lock);
+
+
+/* Strands per core on the running sun4v chip.  Chip types other than
+ * Niagara 1 and Niagara 2 are counted as one strand per core.
+ */
+static int strands_per_core(void)
+{
+	int strands = 1;
+
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_NIAGARA1:
+		strands = 4;
+		break;
+	case SUN4V_CHIP_NIAGARA2:
+		strands = 8;
+		break;
+	default:
+		break;
+	}
+	return strands;
+}
+
+/* Translate a rover index into a CPU id: cores first, then strands. */
+static int iterate_cpu(unsigned int index)
+{
+	static unsigned int num_cpus = 0;
+	static unsigned int num_cores = 0;
+	unsigned int strand, s_per_core;
+
+	s_per_core = strands_per_core();
+	/* First call: pad num_cpus to a multiple of strands_per_core. */
+	if (unlikely(num_cores == 0)) {
+		num_cpus = num_possible_cpus();
+		num_cores = ((num_cpus / s_per_core) +
+			     (num_cpus % s_per_core ? 1 : 0));
+		num_cpus = num_cores * s_per_core;
+	}
+
+	strand = (index * s_per_core) / num_cpus;
+
+	/* Optimize for the T2.  Each core in the T2 has two instruction
+	 * pipelines.  Stagger the CPU distribution across different cores
+	 * first, and then across different pipelines.
+	 */
+	if (sun4v_chip_type == SUN4V_CHIP_NIAGARA2) {
+		if ((index / num_cores) & 0x01)
+			strand = s_per_core - strand;
+	}
+
+	return ((index * s_per_core) % num_cpus) + strand;
+}
+
+/* Rebuild cpu_distribution_map[] with one slot per online CPU. */
+void cpu_map_init(void)
+{
+ int i, cpu, cpu_rover = 0;
+ unsigned long flag;
+
+ spin_lock_irqsave(&cpu_map_lock, flag);
+ for (i = 0; i < num_online_cpus(); i++) {
+ do { /* advance the rover until it lands on an online CPU */
+ cpu = iterate_cpu(cpu_rover++);
+ } while (!cpu_online(cpu));
+ cpu_distribution_map[i] = cpu;
+ }
+ cpu_map_entries = i; /* number of valid slots in the map */
+ spin_unlock_irqrestore(&cpu_map_lock, flag);
+}
+
+/* Return the CPU stored in slot (index % cpu_map_entries) of the map. */
+int map_to_cpu(unsigned int index)
+{
+ unsigned int mapped_cpu;
+ unsigned long flag;
+
+ spin_lock_irqsave(&cpu_map_lock, flag);
+ if (unlikely(cpu_map_entries != num_online_cpus())) {
+ spin_unlock_irqrestore(&cpu_map_lock, flag);
+ cpu_map_init(); /* takes cpu_map_lock itself, hence the unlock */
+ spin_lock_irqsave(&cpu_map_lock, flag);
+ }
+ mapped_cpu = cpu_distribution_map[index % cpu_map_entries];
+#ifdef CONFIG_HOTPLUG_CPU
+ while (!cpu_online(mapped_cpu)) { /* rebuild until the slot is online */
+ spin_unlock_irqrestore(&cpu_map_lock, flag);
+ cpu_map_init();
+ spin_lock_irqsave(&cpu_map_lock, flag);
+ mapped_cpu = cpu_distribution_map[index % cpu_map_entries];
+ }
+#endif /* CONFIG_HOTPLUG_CPU */
+ spin_unlock_irqrestore(&cpu_map_lock, flag);
+ return mapped_cpu;
+}
+EXPORT_SYMBOL(map_to_cpu);
diff --git a/arch/sparc/kernel/cpumap.h b/arch/sparc/kernel/cpumap.h
new file mode 100644
index 0000000..524b207
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.h
@@ -0,0 +1,15 @@
+#ifndef _CPUMAP_H
+#define _CPUMAP_H
+
+#ifdef CONFIG_SMP
+extern void cpu_map_init(void);
+extern int map_to_cpu(unsigned int index);
+#else /* !CONFIG_SMP: cpu_map_init() is a no-op */
+#define cpu_map_init() do {} while (0)
+static inline int map_to_cpu(unsigned int index)
+{
+ return raw_smp_processor_id(); /* index is not used here */
+}
+#endif /* CONFIG_SMP */
+
+#endif
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 5deabe9..b68386d 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -44,8 +44,9 @@
#include <asm/hypervisor.h>
#include <asm/cacheflush.h>
#include "entry.h"
+#include "cpumap.h"
#define NUM_IVECS (IMAP_INR + 1)
struct ino_bucket *ivector_table;
@@ -255,37 +256,15 @@ static int irq_choose_cpu(unsigned int virt_irq)
cpumask_t mask;
int cpuid;
cpumask_copy(&mask, irq_desc[virt_irq].affinity);
- if (cpus_equal(mask, CPU_MASK_ALL)) {
- static int irq_rover;
- static DEFINE_SPINLOCK(irq_rover_lock);
- unsigned long flags;
-
- /* Round-robin distribution... */
- do_round_robin:
- spin_lock_irqsave(&irq_rover_lock, flags);
-
- while (!cpu_online(irq_rover)) {
- if (++irq_rover >= nr_cpu_ids)
- irq_rover = 0;
- }
- cpuid = irq_rover;
- do {
- if (++irq_rover >= nr_cpu_ids)
- irq_rover = 0;
- } while (!cpu_online(irq_rover));
-
- spin_unlock_irqrestore(&irq_rover_lock, flags);
+ if (cpus_equal(mask, cpu_online_map)) {
+ cpuid = map_to_cpu(virt_irq);
} else {
cpumask_t tmp;
cpus_and(tmp, cpu_online_map, mask);
-
- if (cpus_empty(tmp))
- goto do_round_robin;
-
- cpuid = first_cpu(tmp);
+ cpuid = cpus_empty(tmp) ? map_to_cpu(virt_irq) : first_cpu(tmp);
}
return cpuid;
}
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index f7642e5..54906aa 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1314,8 +1314,10 @@ int __cpu_disable(void)
ipi_call_lock();
cpu_clear(cpu, cpu_online_map);
ipi_call_unlock();
+ cpu_map_init();
+
return 0;
}
void __cpu_die(unsigned int cpu)
--
1.6.0.3
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v2] sparc64: fix and optimize irq distribution
2009-05-13 16:52 [PATCH v2] sparc64: fix and optimize irq distribution Hong H. Pham
@ 2009-05-22 0:14 ` David Miller
2009-05-22 0:18 ` David Miller
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2009-05-22 0:14 UTC (permalink / raw)
To: sparclinux
From: "Hong H. Pham" <hong.pham@windriver.com>
Date: Wed, 13 May 2009 12:52:31 -0400
> irq_choose_cpu() should compare the affinity mask against cpu_online_map
> rather than CPU_MASK_ALL, since irq_select_affinity() sets the interrupt's
> affinity mask to cpu_online_map "and" CPU_MASK_ALL (which ends up being
> just cpu_online_map). The mask comparison in irq_choose_cpu() will always
> fail since the two masks are not the same. So the CPU chosen is the first CPU
> in the intersection of cpu_online_map and CPU_MASK_ALL, which is always CPU0.
> That means all interrupts are reassigned to CPU0...
>
> Distributing interrupts to CPUs in a linearly increasing round robin fashion
> is not optimal for the UltraSPARC T1/T2. Also, the irq_rover in
> irq_choose_cpu() causes an interrupt to be assigned to a different
> processor each time the interrupt is allocated and released. This may lead
> to an unbalanced distribution over time.
>
> A static mapping of interrupts to processors is done to optimize and balance
> interrupt distribution. For the T1/T2, interrupts are spread to different
> cores first, and then to strands within a core.
>
> The following are benchmarks showing the effects of interrupt distribution
> on a T2. The test was done with iperf using a pair of T5220 boxes, each
> with a 10GBe NIU (XAUI) connected back to back.
>
> TCP | Stock Linear RR IRQ Optimized IRQ
> Streams | 2.6.30-rc5 Distribution Distribution
> | GBits/sec GBits/sec GBits/sec
> --------+-----------------------------------------
> 1 0.839 0.862 0.868
> 8 1.16 4.96 5.88
> 16 1.15 6.40 8.04
> 100 1.09 7.28 8.68
>
> Signed-off-by: Hong H. Pham <hong.pham@windriver.com>
I like this patch a lot but it's going to do the wrong thing on
virtualized guests.
There is absolutely no connection between virtual cpu numbers
and the hierarchy in which they sit in the cores and higher
level hierarchy of the processor. So you can't just say
(cpu_id / 4) is the core number or anything like that.
You must use the machine description to determine this kind of
information, just as we do in arch/sparc/kernel/mdesc.c to figure out
the CPU scheduler grouping maps. (see mark_proc_ids() and
mark_core_ids())
This will also allow your code to transparently work on ROCK and other
future cpus without any changes.
I'm happy to apply this patch once you change it to use the MDESC
properly to probe the cpu hierarchy information.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2] sparc64: fix and optimize irq distribution
2009-05-13 16:52 [PATCH v2] sparc64: fix and optimize irq distribution Hong H. Pham
2009-05-22 0:14 ` David Miller
@ 2009-05-22 0:18 ` David Miller
2009-05-22 21:55 ` Hong H. Pham
2009-05-23 0:21 ` David Miller
3 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2009-05-22 0:18 UTC (permalink / raw)
To: sparclinux
From: David Miller <davem@davemloft.net>
Date: Thu, 21 May 2009 17:14:24 -0700 (PDT)
> I'm happy to apply this patch once you change it to use the MDESC
> properly to probe the cpu hierarchy information.
BTW, you could also use the precomputed scheduler grouping
cpu masks in your distribution table building too.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v2] sparc64: fix and optimize irq distribution
2009-05-13 16:52 [PATCH v2] sparc64: fix and optimize irq distribution Hong H. Pham
2009-05-22 0:14 ` David Miller
2009-05-22 0:18 ` David Miller
@ 2009-05-22 21:55 ` Hong H. Pham
2009-05-23 0:21 ` David Miller
3 siblings, 0 replies; 5+ messages in thread
From: Hong H. Pham @ 2009-05-22 21:55 UTC (permalink / raw)
To: sparclinux
[-- Attachment #1: Type: text/plain, Size: 1872 bytes --]
David Miller wrote:
> There is absolutely no connection between virtual cpu numbers
> and the hierarchy in which they sit in the cores and higher
> level hierarchy of the processor. So you can't just say
> (cpu_id / 4) is the core number or anything like that.
>
> You must use the machine description to determine this kind of
> information, just as we do in arch/sparc/kernel/mdesc.c to figure out
> the CPU scheduler grouping maps. (see mark_proc_ids() and
> mark_core_ids())
Thanks for pointing me in this direction. mark_proc_ids() and
mark_core_ids() set the core_id and proc_id members in the
per-cpu __cpu_data. Looks like I can use cpu_data() to figure out
the CPU distribution.
As a side note, here's a dump of cpu_data() on a 2 way T5440.
There's a hole between 48 and 71.
[714162.134215] Brought up 96 CPUs
[714162.135440] CPU 0: node=0 core_id=1 proc_id=0
[714162.135452] CPU 1: node=0 core_id=1 proc_id=0
[714162.135464] CPU 2: node=0 core_id=1 proc_id=0
[714162.135475] CPU 3: node=0 core_id=1 proc_id=0
[714162.135487] CPU 4: node=0 core_id=1 proc_id=1
[714162.135498] CPU 5: node=0 core_id=1 proc_id=1
[714162.135509] CPU 6: node=0 core_id=1 proc_id=1
[714162.135521] CPU 7: node=0 core_id=1 proc_id=1
[714162.135532] CPU 8: node=0 core_id=2 proc_id=2
[714162.135544] CPU 9: node=0 core_id=2 proc_id=2
[714162.135555] CPU 10: node=0 core_id=2 proc_id=2
...
[714162.135961] CPU 45: node=0 core_id=6 proc_id=11
[714162.135973] CPU 46: node=0 core_id=6 proc_id=11
[714162.135984] CPU 47: node=0 core_id=6 proc_id=11
[714162.135996] CPU 72: node=1 core_id=7 proc_id=12
[714162.136008] CPU 73: node=1 core_id=7 proc_id=12
[714162.136019] CPU 74: node=1 core_id=7 proc_id=12
[714162.136031] CPU 75: node=1 core_id=7 proc_id=12
[714162.136043] CPU 76: node=1 core_id=7 proc_id=13
...
[714162.136554] CPU 119: node=1 core_id=12 proc_id=23
Regards,
Hong
[-- Attachment #2: dump_cpu_data.patch --]
[-- Type: text/x-patch, Size: 575 bytes --]
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 54906aa..7fa909f 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1353,8 +1353,20 @@ void __cpu_die(unsigned int cpu)
}
#endif
+/* Debug aid: log node, core_id and proc_id of each online CPU. */
+static void dump_cpu_data(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		printk(KERN_DEBUG "CPU %i: node=%i core_id=%i proc_id=%i\n",
+		       cpu, cpu_to_node(cpu),
+		       cpu_data(cpu).core_id, cpu_data(cpu).proc_id);
+}
+
void __init smp_cpus_done(unsigned int max_cpus)
{
+ dump_cpu_data();
}
void smp_send_reschedule(int cpu)
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH v2] sparc64: fix and optimize irq distribution
2009-05-13 16:52 [PATCH v2] sparc64: fix and optimize irq distribution Hong H. Pham
` (2 preceding siblings ...)
2009-05-22 21:55 ` Hong H. Pham
@ 2009-05-23 0:21 ` David Miller
3 siblings, 0 replies; 5+ messages in thread
From: David Miller @ 2009-05-23 0:21 UTC (permalink / raw)
To: sparclinux
From: "Hong H. Pham" <hong.pham@windriver.com>
Date: Fri, 22 May 2009 17:55:16 -0400
> As a side note, here's a dump of cpu_data() on a 2 way T5440.
> There's a hole between 48 and 71.
>
> [714162.134215] Brought up 96 CPUs
Of course there is, you only have 96 out of 128 cpus enabled so there
will be holes wherever the cores have been disabled.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2009-05-23 0:21 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-05-13 16:52 [PATCH v2] sparc64: fix and optimize irq distribution Hong H. Pham
2009-05-22 0:14 ` David Miller
2009-05-22 0:18 ` David Miller
2009-05-22 21:55 ` Hong H. Pham
2009-05-23 0:21 ` David Miller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.