[PATCH] NUMA-Q disable irqbalance

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] NUMA-Q disable irqbalance
@ 2002-08-05 23:51 Martin J. Bligh
  2002-08-13 16:13 ` Martin J. Bligh
  0 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-05 23:51 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
and makes it a config option for everyone else. This is needed for NUMA-Q
to work, since the irq_balance code assumes a logical flat apic addressing
mode that's not true in all cases. We created a config option since 
irq_balance makes performance significantly worse for some workloads.

Please apply,

Martin.

diff -Nur linux-2.5.25-vanilla/arch/i386/Config.help linux-2.5.25-patched/arch/i386/Config.help
--- linux-2.5.25-vanilla/arch/i386/Config.help	Fri Jul  5 16:42:04 2002
+++ linux-2.5.25-patched/arch/i386/Config.help	Thu Jul 11 17:27:01 2002
@@ -41,6 +41,12 @@
   486, 586, Pentiums, and various instruction-set-compatible chips by
   AMD, Cyrix, and others.
 
+CONFIG_IRQ_BALANCE
+  This option is used to turn IRQ Balancing on machines with multiple
+  APIC's (ie: SMP, NUMA, etc) on or off.  This behavior has been seen 
+  under some conditions to reduce performance, and on some platorms causes
+  interesting hangs, particularly those with more than 8 CPUs.
+
 CONFIG_MULTIQUAD
   This option is used for getting Linux to run on a (IBM/Sequent) NUMA 
   multiquad box. This changes the way that processors are bootstrapped,
diff -Nur linux-2.5.25-vanilla/arch/i386/config.in linux-2.5.25-patched/arch/i386/config.in
--- linux-2.5.25-vanilla/arch/i386/config.in	Fri Jul  5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/config.in	Thu Jul 11 17:16:52 2002
@@ -164,8 +164,19 @@
    if [ "$CONFIG_X86_UP_IOAPIC" = "y" ]; then
       define_bool CONFIG_X86_IO_APIC y
    fi
+   define_bool CONFIG_IRQBALANCE n
 else
    bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+   if [ "$CONFIG_MULTIQUAD" = "y" ]; then
+      define_bool CONFIG_IRQBALANCE_DISABLE y
+   else
+      bool 'Turn Off IRQ Balancing' CONFIG_IRQBALANCE_DISABLE
+   fi
+   if [ "$CONFIG_IRQBALANCE_DISABLE" = "y" ]; then
+      define_bool CONFIG_IRQBALANCE n
+   else
+      define_bool CONFIG_IRQBALANCE y
+   fi
 fi
 
 bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c linux-2.5.25-patched/arch/i386/kernel/io_apic.c
--- linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c	Fri Jul  5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/kernel/io_apic.c	Thu Jul 11 16:12:28 2002
@@ -199,7 +199,7 @@
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#if CONFIG_SMP
+#if CONFIG_IRQBALANCE
 
 typedef struct {
 	unsigned int cpu;
@@ -211,15 +211,12 @@
 
 extern unsigned long irq_affinity [NR_IRQS];
 
-#endif
-
 #define IDLE_ENOUGH(cpu,now) \
 		(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
 
 #define IRQ_ALLOWED(cpu,allowed_mask) \
 		((1 << cpu) & (allowed_mask))
 
-#if CONFIG_SMP
 static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
 {
 	int search_idle = 1;
@@ -264,9 +261,9 @@
 		set_ioapic_affinity(irq, 1 << entry->cpu);
 	}
 }
-#else /* !SMP */
+#else /* !CONFIG_IRQBALANCE */
 static inline void balance_irq(int irq) { }
-#endif
+#endif /* CONFIG_IRQBALANCE */
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH] NUMA-Q disable irqbalance
  2002-08-05 23:51 [PATCH] NUMA-Q disable irqbalance Martin J. Bligh
@ 2002-08-13 16:13 ` Martin J. Bligh
  2002-08-13 16:41   ` Linus Torvalds
  0 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 16:13 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
and makes it a config option for everyone else. This is needed for NUMA-Q
to work, since the irq_balance code assumes a logical flat apic addressing
mode that's not true in all cases. We created a config option since 
irq_balance makes performance significantly worse for some workloads.
Says it's against 2.5.25, but applies to and works on 2.5.31

Please apply,

Martin.

diff -Nur linux-2.5.25-vanilla/arch/i386/Config.help linux-2.5.25-patched/arch/i386/Config.help
--- linux-2.5.25-vanilla/arch/i386/Config.help	Fri Jul  5 16:42:04 2002
+++ linux-2.5.25-patched/arch/i386/Config.help	Thu Jul 11 17:27:01 2002
@@ -41,6 +41,12 @@
   486, 586, Pentiums, and various instruction-set-compatible chips by
   AMD, Cyrix, and others.
 
+CONFIG_IRQ_BALANCE
+  This option is used to turn IRQ Balancing on machines with multiple
+  APIC's (ie: SMP, NUMA, etc) on or off.  This behavior has been seen 
+  under some conditions to reduce performance, and on some platorms causes
+  interesting hangs, particularly those with more than 8 CPUs.
+
 CONFIG_MULTIQUAD
   This option is used for getting Linux to run on a (IBM/Sequent) NUMA 
   multiquad box. This changes the way that processors are bootstrapped,
diff -Nur linux-2.5.25-vanilla/arch/i386/config.in linux-2.5.25-patched/arch/i386/config.in
--- linux-2.5.25-vanilla/arch/i386/config.in	Fri Jul  5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/config.in	Thu Jul 11 17:16:52 2002
@@ -164,8 +164,19 @@
    if [ "$CONFIG_X86_UP_IOAPIC" = "y" ]; then
       define_bool CONFIG_X86_IO_APIC y
    fi
+   define_bool CONFIG_IRQBALANCE n
 else
    bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+   if [ "$CONFIG_MULTIQUAD" = "y" ]; then
+      define_bool CONFIG_IRQBALANCE_DISABLE y
+   else
+      bool 'Turn Off IRQ Balancing' CONFIG_IRQBALANCE_DISABLE
+   fi
+   if [ "$CONFIG_IRQBALANCE_DISABLE" = "y" ]; then
+      define_bool CONFIG_IRQBALANCE n
+   else
+      define_bool CONFIG_IRQBALANCE y
+   fi
 fi
 
 bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c linux-2.5.25-patched/arch/i386/kernel/io_apic.c
--- linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c	Fri Jul  5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/kernel/io_apic.c	Thu Jul 11 16:12:28 2002
@@ -199,7 +199,7 @@
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#if CONFIG_SMP
+#if CONFIG_IRQBALANCE
 
 typedef struct {
 	unsigned int cpu;
@@ -211,15 +211,12 @@
 
 extern unsigned long irq_affinity [NR_IRQS];
 
-#endif
-
 #define IDLE_ENOUGH(cpu,now) \
 		(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))
 
 #define IRQ_ALLOWED(cpu,allowed_mask) \
 		((1 << cpu) & (allowed_mask))
 
-#if CONFIG_SMP
 static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
 {
 	int search_idle = 1;
@@ -264,9 +261,9 @@
 		set_ioapic_affinity(irq, 1 << entry->cpu);
 	}
 }
-#else /* !SMP */
+#else /* !CONFIG_IRQBALANCE */
 static inline void balance_irq(int irq) { }
-#endif
+#endif /* CONFIG_IRQBALANCE */
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 16:13 ` Martin J. Bligh
@ 2002-08-13 16:41   ` Linus Torvalds
  2002-08-13 16:57     ` Alan Cox
  2002-08-13 17:14     ` Martin J. Bligh
  0 siblings, 2 replies; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 16:41 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel

On Tue, 13 Aug 2002, Martin J. Bligh wrote:
:
> This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
> and makes it a config option for everyone else.

Please don't use negative config options.

I'd much rather have

	bool 'IRQ balancing support' CONFIG_IRQ_BALANCE

than some "Disable IRQ balancing?" question.

Also, the explanation should probably explain that a P4 needs manual IRQ 
balancing since the P4 broke the Intel-documented round-robin behaviour.

Finally, exactly since IRQ balancing is practically required on P4-SMP, I
really don't think a CONFIG option works. It needs to be configured in on
any kernel that expects to use P4's in an SMP configuration.

In other words, I think this needs to do a dynamic disable (with the 
possible exception of a NUMA-Q machine, since that one is already a static 
config option and won't have P4's in it).

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 16:41   ` Linus Torvalds
@ 2002-08-13 16:57     ` Alan Cox
  2002-08-13 17:24       ` Martin J. Bligh
  2002-08-13 17:14     ` Martin J. Bligh
  1 sibling, 1 reply; 34+ messages in thread
From: Alan Cox @ 2002-08-13 16:57 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, linux-kernel

On Tue, 2002-08-13 at 17:41, Linus Torvalds wrote:
> Finally, exactly since IRQ balancing is practically required on P4-SMP, I
> really don't think a CONFIG option works. It needs to be configured in on
> any kernel that expects to use P4's in an SMP configuration.
> 
> In other words, I think this needs to do a dynamic disable (with the 
> possible exception of a NUMA-Q machine, since that one is already a static 
> config option and won't have P4's in it).

In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
That's how James Cleverdon (I think it was he) implemented the detection
logic and mixed summit/sane-pc kernel build that seems to work well now



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 16:41   ` Linus Torvalds
  2002-08-13 16:57     ` Alan Cox
@ 2002-08-13 17:14     ` Martin J. Bligh
  2002-08-13 17:24       ` Linus Torvalds
  1 sibling, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 17:14 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Matt Dobson

>> This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
>> and makes it a config option for everyone else.
> 
> Please don't use negative config options.
> 
> I'd much rather have
> 
> 	bool 'IRQ balancing support' CONFIG_IRQ_BALANCE
> 
> than some "Disable IRQ balancing?" question.

That piece of wierdness I'll take the blame for, not Matt. It's more complex, we 
just wanted to make the default be to have it turned on, maybe we've missed 
some way in the config to make it work like that?

> Also, the explanation should probably explain that a P4 needs manual IRQ 
> balancing since the P4 broke the Intel-documented round-robin behaviour.
> 
> Finally, exactly since IRQ balancing is practically required on P4-SMP, I
> really don't think a CONFIG option works. It needs to be configured in on
> any kernel that expects to use P4's in an SMP configuration.

That'd be easier if it was written in such a way it worked on all P4 systems.
For one, it does an rdtsc directly, which doesn't work on all machines.
For another, it assumes flat logical addressing mode. We need to be able
to disable this until it's rewritten in such a way as to be more generic.

> In other words, I think this needs to do a dynamic disable (with the 
> possible exception of a NUMA-Q machine, since that one is already a static 
> config option and won't have P4's in it).

Was there some reason you really need this on P4s? I seem to recall something
to do with timer interrupts, but don't remember exactly.

M.

PS. I think __DO_ACTION gets an award for disgusting use of macros.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 17:14     ` Martin J. Bligh
@ 2002-08-13 17:24       ` Linus Torvalds
  2002-08-13 18:02         ` Martin J. Bligh
  0 siblings, 1 reply; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 17:24 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, Matt Dobson


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
> Was there some reason you really need this on P4s? I seem to recall something
> to do with timer interrupts, but don't remember exactly.

Without the explicit balancing, _every_single_ external interrupt comes in 
on CPU0 on a P4.

The P4 local APIC doesn't do irq scheduling in hardware (never mind that
Intel documented it as architecture behaviour in earlier local APICs)

		Linus


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 16:57     ` Alan Cox
@ 2002-08-13 17:24       ` Martin J. Bligh
  2002-08-13 17:38         ` Alan Cox
  0 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 17:24 UTC (permalink / raw)
  To: Alan Cox, Linus Torvalds; +Cc: linux-kernel

> In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
> That's how James Cleverdon (I think it was he) implemented the detection
> logic and mixed summit/sane-pc kernel build that seems to work well now

The trouble with that is that is it doesn't provide an interface for people to
disable it by hand for the many cases where constantly reprogramming 
the IO-APIC reduces the performance of their workload.

M.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 17:24       ` Martin J. Bligh
@ 2002-08-13 17:38         ` Alan Cox
  0 siblings, 0 replies; 34+ messages in thread
From: Alan Cox @ 2002-08-13 17:38 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Linus Torvalds, linux-kernel

On Tue, 2002-08-13 at 18:24, Martin J. Bligh wrote:
> > In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
> > That's how James Cleverdon (I think it was he) implemented the detection
> > logic and mixed summit/sane-pc kernel build that seems to work well now
> 
> The trouble with that is that is it doesn't provide an interface for people to
> disable it by hand for the many cases where constantly reprogramming 
> the IO-APIC reduces the performance of their workload.

I have zero problems with also being able to manually disable it


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 17:24       ` Linus Torvalds
@ 2002-08-13 18:02         ` Martin J. Bligh
  2002-08-13 18:20           ` Linus Torvalds
  0 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 18:02 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Matt Dobson

>> Was there some reason you really need this on P4s? I seem to recall something
>> to do with timer interrupts, but don't remember exactly.
> 
> Without the explicit balancing, _every_single_ external interrupt comes in 
> on CPU0 on a P4.
> 
> The P4 local APIC doesn't do irq scheduling in hardware (never mind that
> Intel documented it as architecture behaviour in earlier local APICs)

I know, but you pays your money, you choose your breakage ;-)
I can't help feeling that the real solution is to program the TPR like Intel intended,
instead of frigging with the IO-APIC, especially when the the code that does the
frigging is written with incorrect assumuptions. 

Forcing it on for every machine just because P4s are borked sounds wrong.
The current code has several issues (including, I believe, being frequency
dependent on jiffies ... bad with 1000 HZ), so we really do need to disable
it for many machines. Getting rid of the negative config option is easy though.

M.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 18:02         ` Martin J. Bligh
@ 2002-08-13 18:20           ` Linus Torvalds
  2002-08-13 18:58             ` Martin J. Bligh
  0 siblings, 1 reply; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 18:20 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
> I know, but you pays your money, you choose your breakage ;-)

Well, in this case, it is _you_ who end up having to choose your breakage.

> Forcing it on for every machine just because P4s are borked sounds wrong.

THAT IS NOT WHAT I SAID. Go back and read it. I said that since the P4
needs it, you don't have the choice of just ignoring it. Especially since
there are about a million more P4's out there than NUMA-Q machines.

It needs to be dynamic, not "disable it".

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 18:20           ` Linus Torvalds
@ 2002-08-13 18:58             ` Martin J. Bligh
  2002-08-13 19:22               ` Linus Torvalds
  0 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 18:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Matt Dobson

>> I know, but you pays your money, you choose your breakage ;-)
> 
> Well, in this case, it is _you_ who end up having to choose your breakage.

Indeed. Empower the user.

>> Forcing it on for every machine just because P4s are borked sounds wrong.
> 
> THAT IS NOT WHAT I SAID. Go back and read it. I said that since the P4

OK, I was being unclear, that's not really what I meant. If I may rephrase:
I don't like the performance hit it gives on P3 standard SMP machines (not
NUMA-Q) though it does work on there too, and there's no easy way for 
people to disable it.

> needs it, you don't have the choice of just ignoring it. Especially since
> there are about a million more P4's out there than NUMA-Q machines.
>  
> It needs to be dynamic, not "disable it".

I did read what you said, but this isn't just about NUMA-Q. There are two issues here:

1. It doesn't work at all for some machines. Yes, we can get rid of that problem by
dynamic disable, or hanging off the existing config option. If that's all you'll accept,
we can cut a minimal patch to do that (that's actually what Matt did originally)

2. It makes performance worse on some normal SMP machines. That's what I want
the manual config option for. If you want it forced on for P4s, fine. I suspect this
may be able to be removed if someone can fix the code so it's performant.

M.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 18:58             ` Martin J. Bligh
@ 2002-08-13 19:22               ` Linus Torvalds
  2002-08-13 20:04                 ` Martin J. Bligh
                                   ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 19:22 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, Matt Dobson


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
> OK, I was being unclear, that's not really what I meant. If I may rephrase:
> I don't like the performance hit it gives on P3 standard SMP machines (not
> NUMA-Q) though it does work on there too, and there's no easy way for 
> people to disable it.

Well, it makes performance _so_ much better on a P4 that it's not even 
funny. It's basically a "P4 is unusable with SMP" without it.

		Linus


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 19:22               ` Linus Torvalds
@ 2002-08-13 20:04                 ` Martin J. Bligh
  2002-08-13 20:22                   ` Linus Torvalds
  2002-08-13 20:22                 ` Alan Cox
  2002-08-24 12:19                 ` Zwane Mwaikambo
  2 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 20:04 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Matt Dobson

>> OK, I was being unclear, that's not really what I meant. If I may rephrase:
>> I don't like the performance hit it gives on P3 standard SMP machines (not
>> NUMA-Q) though it does work on there too, and there's no easy way for 
>> people to disable it.
> 
> Well, it makes performance _so_ much better on a P4 that it's not even 
> funny. It's basically a "P4 is unusable with SMP" without it.

Right, accepted. But if it's good for P4, and bad for P3 (at least for some 
workloads), surely this leads to the conclusion that it should be a config 
option (probably defaulting to being on)? If you can see another way to
solve the conundrum ....

I can understand you didn't like the negative stuff, but there's a choice 
between two evils ... - some mess in the config.in file, and having a sensible
default. We will cut that one whichever way you like (though I can't help
thinking I'm missing some obvious default thing in the config language).

If you still hate it, the other option I can see is to cut a minimal patch for now, 
which institutes a CONFIG_IRQ_BALANCE, but just decides that off the existing
config options, rather than asking any more questions. Then we'll look at
the perf problems seperately, and see if we can fix them - people can force
it off by editing the .config file by hand if they want until then.

M.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:04                 ` Martin J. Bligh
@ 2002-08-13 20:22                   ` Linus Torvalds
  2002-08-14  5:52                     ` Martin J. Bligh
  2002-08-14 10:10                     ` Jos Hulzink
  0 siblings, 2 replies; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 20:22 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
> Right, accepted. But if it's good for P4, and bad for P3 (at least for some 
> workloads), surely this leads to the conclusion that it should be a config 
> option (probably defaulting to being on)? If you can see another way to
> solve the conundrum ....

But this is exactly the kinds of cases that config options do _not_ work 
well for.

There are tons of reasons to run the same kernel on a multitude of 
machines, even ignoring the issue of things like installers etc. 

We had this CONFIG_xxxx disease when it came to SSE, we had it when it 
came to TSC, etc. And in every case it ended up being bad, simply because 
it's not the right interface for _users_. 

So this is why I think the IRQ balance code has to be there, regardless, 
and then it gets turned on dynamically for when it is needed (or turned 
off when it hurts, whatever). But it should _not_ be a CONFIG option.

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 19:22               ` Linus Torvalds
  2002-08-13 20:04                 ` Martin J. Bligh
@ 2002-08-13 20:22                 ` Alan Cox
  2002-08-13 20:35                   ` Linus Torvalds
  2002-08-24 12:19                 ` Zwane Mwaikambo
  2 siblings, 1 reply; 34+ messages in thread
From: Alan Cox @ 2002-08-13 20:22 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 2002-08-13 at 20:22, Linus Torvalds wrote:
> 
> On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> > 
> > OK, I was being unclear, that's not really what I meant. If I may rephrase:
> > I don't like the performance hit it gives on P3 standard SMP machines (not
> > NUMA-Q) though it does work on there too, and there's no easy way for 
> > people to disable it.
> 
> Well, it makes performance _so_ much better on a P4 that it's not even 
> funny. It's basically a "P4 is unusable with SMP" without it.

On a collection of networking workloads the P4 is about 5% better
performing with the irq balancer off.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:35                   ` Linus Torvalds
@ 2002-08-13 20:34                     ` Alan Cox
  2002-08-13 20:42                     ` Martin J. Bligh
  2002-08-13 22:08                     ` Rik van Riel
  2 siblings, 0 replies; 34+ messages in thread
From: Alan Cox @ 2002-08-13 20:34 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 2002-08-13 at 21:35, Linus Torvalds wrote:
> Now, there have been other changes too - like the scheduler (and my
> current P4 has a different SCSI interface), but I dunno. The thing I 
> attributed the improvements in interactive feel was the fact that the work 
> got balanced out more sanely.

I've been benching 2.4 with the O(1) scheduler and the process load
balancing changes and seeing this. Conceptually I can understand why it
would occur (cache affinity) but if so we ought to do some much slower
(maybe once every 5 seconds based on accumulated averages) balancing not
none


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:22                 ` Alan Cox
@ 2002-08-13 20:35                   ` Linus Torvalds
  2002-08-13 20:34                     ` Alan Cox
                                       ` (2 more replies)
  0 siblings, 3 replies; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 20:35 UTC (permalink / raw)
  To: Alan Cox; +Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On 13 Aug 2002, Alan Cox wrote:
> 
> On a collection of networking workloads the P4 is about 5% better
> performing with the irq balancer off.

Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues 
were fixed.

Now, there have been other changes too - like the scheduler (and my
current P4 has a different SCSI interface), but I dunno. The thing I 
attributed the improvements in interactive feel was the fact that the work 
got balanced out more sanely.

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:35                   ` Linus Torvalds
  2002-08-13 20:34                     ` Alan Cox
@ 2002-08-13 20:42                     ` Martin J. Bligh
  2002-08-13 21:24                       ` Linus Torvalds
  2002-08-13 22:08                     ` Rik van Riel
  2 siblings, 1 reply; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-13 20:42 UTC (permalink / raw)
  To: Linus Torvalds, Alan Cox; +Cc: linux-kernel, Andrew Theurer

>> On a collection of networking workloads the P4 is about 5% better
>> performing with the irq balancer off.
> 
> Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues 
> were fixed.
> 
> Now, there have been other changes too - like the scheduler (and my
> current P4 has a different SCSI interface), but I dunno. The thing I 
> attributed the improvements in interactive feel was the fact that the work 
> got balanced out more sanely.

Was that before or after you changed HZ to 1000? I *think* that increased
the frequency of IO-APIC reprogramming by a factor of 10, though I might
be misreading the code. If it does depend on HZ, I think that's bad.

People in our benchmarking group (Andrew, cc'ed) have told me that 
reducing the frequency of IO-APIC reprogramming by a factor of 20 or so
improves performance greatly  - don't know what HZ that was at, but the
whole thing seems a little overenthusiastic to me.

M.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:42                     ` Martin J. Bligh
@ 2002-08-13 21:24                       ` Linus Torvalds
  2002-08-13 22:29                         ` Andrew Theurer
  2002-08-20  0:49                         ` [PATCH] NUMA-Q disable irqbalance Dave Hansen
  0 siblings, 2 replies; 34+ messages in thread
From: Linus Torvalds @ 2002-08-13 21:24 UTC (permalink / raw)
  To: Martin J. Bligh; +Cc: Alan Cox, linux-kernel, Andrew Theurer

On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
> Was that before or after you changed HZ to 1000? I *think* that increased
> the frequency of IO-APIC reprogramming by a factor of 10, though I might
> be misreading the code. If it does depend on HZ, I think that's bad.

The 1000Hz thing came much later, and I never noticed any impact of that 
on my machines.

(Note that this is all entrely subjective. I was very disappointed in the
feel of the first HT P4 machine I had for the first few weeks, but apart
from running lmbench - which looked ok even though it shows that P4's are
bad at system calls - I've not actually put numbers on it. But my feeling
was that the irq thing made a noticeable difference. Caveat emptor -
subjective feelings are not good).

> People in our benchmarking group (Andrew, cc'ed) have told me that
> reducing the frequency of IO-APIC reprogramming by a factor of 20 or
> so improves performance greatly - don't know what HZ that was at, but
> the whole thing seems a little overenthusiastic to me.

The rebalancing was certainly done with a 100Hz clock, so yes, it might 
have become much worse lately.

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:35                   ` Linus Torvalds
  2002-08-13 20:34                     ` Alan Cox
  2002-08-13 20:42                     ` Martin J. Bligh
@ 2002-08-13 22:08                     ` Rik van Riel
  2002-08-13 22:14                       ` Rik van Riel
  2 siblings, 1 reply; 34+ messages in thread
From: Rik van Riel @ 2002-08-13 22:08 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Alan Cox, Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Linus Torvalds wrote:
> On 13 Aug 2002, Alan Cox wrote:
> >
> > On a collection of networking workloads the P4 is about 5% better
> > performing with the irq balancer off.
>
> Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
> were fixed.

"If you can't measure it, it doesn't exist"

*runs like hell*

cheers,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 22:08                     ` Rik van Riel
@ 2002-08-13 22:14                       ` Rik van Riel
  2002-08-14 14:49                         ` Linus Torvalds
  0 siblings, 1 reply; 34+ messages in thread
From: Rik van Riel @ 2002-08-13 22:14 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Alan Cox, Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Rik van Riel wrote:
> On Tue, 13 Aug 2002, Linus Torvalds wrote:

> > Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
> > were fixed.
>
> "If you can't measure it, it doesn't exist"
>
> *runs like hell*

As a clarification to this, I'm not suggesting that interactive
performance doesn't exist, I'm suggesting that we should measure
it.

I've got a few rough ideas on how to measure these things and
will send out a few benchmark proposals shortly.

regards,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 21:24                       ` Linus Torvalds
@ 2002-08-13 22:29                         ` Andrew Theurer
  2002-08-13 23:30                           ` Andrea Arcangeli
  2002-08-20  0:49                         ` [PATCH] NUMA-Q disable irqbalance Dave Hansen
  1 sibling, 1 reply; 34+ messages in thread
From: Andrew Theurer @ 2002-08-13 22:29 UTC (permalink / raw)
  To: Linus Torvalds, Martin J. Bligh; +Cc: Alan Cox, linux-kernel

> > People in our benchmarking group (Andrew, cc'ed) have told me that
> > reducing the frequency of IO-APIC reprogramming by a factor of 20 or
> > so improves performance greatly - don't know what HZ that was at, but
> > the whole thing seems a little overenthusiastic to me.

I thought I saw a big difference in 2.4, but my latest runs could not 
reproduce this.  I also tested Andrea's version 
http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.19rc3aa1/30_irq-balance-12
which addresses some of the problems:

[email to Andrea]
> Andrea,
> 
> I have some irqbalance results.  For some reason, Ingo's isn't performing as 
> bad as I thought on 2.4.  Your patch still performs better.  These results 
> are from NetBench, a CIFS network fileserver benchmark on a samba linux
> 4-way 
> P4 server:
> 
> 2.4.19-rc3aa3:
> 
> No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> 794 Mbps              787 Mbps                        792 Mbps
> 
> With hyperthreading:
> 
> No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> 773 Mbps              798 Mbps                        809 Mbps

Before I tried hyperthreading, no balance was always best performance, but 
"unfair".  However, with hyperthreading, (Andrea's) IRQbalance can be the 
best, probably because an interrupt rate of that level just can't be 
processed by "1/2" of a CPU and would benefit from a distrubution.  I did 
make a run on 2.5.25 a few weeks ago.  With IRQbalance turned off, I was 10% 
better.  I have not had a chance to try Andrea's in 2.5 but I suspect it 
would be best overall, since he compensates for HZ:
#define IRQ_BALANCE_INTERVAL (HZ/50) 
and does not seem to reprogram the IO-APIC as often.  IMO, I think Andrea's 
version is a little less aggressive and has less overhead, something I'd 
prefer in 2.5. 

-Andrew Theurer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 22:29                         ` Andrew Theurer
@ 2002-08-13 23:30                           ` Andrea Arcangeli
  2002-08-14 21:16                             ` James Cleverdon
  2002-08-23  2:31                             ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing James Cleverdon
  0 siblings, 2 replies; 34+ messages in thread
From: Andrea Arcangeli @ 2002-08-13 23:30 UTC (permalink / raw)
  To: Andrew Theurer; +Cc: Linus Torvalds, Martin J. Bligh, Alan Cox, linux-kernel

On Tue, Aug 13, 2002 at 05:29:50PM -0500, Andrew Theurer wrote:
> > 2.4.19-rc3aa3:
> > 
> > No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> > 794 Mbps              787 Mbps                        792 Mbps
> > 
> > With hyperthreading:
> > 
> > No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> > 773 Mbps              798 Mbps                        809 Mbps

thanks again for running the above benchmarks.

> version is a little less aggressive and has less overhead, something I'd 
> prefer in 2.5. 

Second that of course, btw, the detailed explanataion of the changes I
did while merging it can be found on lse-tech.

it is also possible HZ/50 is too high frequency still, I didn't run any
extensive test on the reprogramming frequency. I would suggest to try
with HZ/10 too (so every 100msec instead of every 20msec).

BTW, the very same algorithm should also be shared by alpha, alpha never
had hardware irq balancing support, it's like a p4, and we do static
routing distribution choosed by the kernel at boot which is been pretty
good so far (better than mainline 2.4 on a p4 smp) but the irqblanace
algorithm should be better there too.

Andrea

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:22                   ` Linus Torvalds
@ 2002-08-14  5:52                     ` Martin J. Bligh
  2002-08-14 10:10                     ` Jos Hulzink
  1 sibling, 0 replies; 34+ messages in thread
From: Martin J. Bligh @ 2002-08-14  5:52 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Matt Dobson

> But this is exactly the kinds of cases that config options do 
> _not_ work well for.

OK ... you're right ;-) This is bad, especially for distribution
kernels. So I just need something to stop the NUMA-Q crashing.
Can I have the appended? Please, please, please? ;-)
It just adds an if switch to irq_balance which the compiler
optimises away anyway. Not a whiff of a config option. 
Tested on 2.5.31.

M.

diff -urN -X /home/mbligh/.diff.exclude 2.5.31-virgin/arch/i386/kernel/io_apic.c 2.5.31-irqbalance2/arch/i386/kernel/io_apic.c
--- 2.5.31-virgin/arch/i386/kernel/io_apic.c	Sat Aug 10 18:41:26 2002
+++ 2.5.31-irqbalance2/arch/i386/kernel/io_apic.c	Tue Aug 13 22:32:49 2002
@@ -251,6 +251,9 @@
 	irq_balance_t *entry = irq_balance + irq;
 	unsigned long now = jiffies;
 
+	if (clustered_apic_mode)
+		return;
+
 	if (entry->timestamp != now) {
 		unsigned long allowed_mask;
 		int random_number;



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 20:22                   ` Linus Torvalds
  2002-08-14  5:52                     ` Martin J. Bligh
@ 2002-08-14 10:10                     ` Jos Hulzink
  2002-08-14 11:12                       ` David Lang
  1 sibling, 1 reply; 34+ messages in thread
From: Jos Hulzink @ 2002-08-14 10:10 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Linus Torvalds wrote:

> There are tons of reasons to run the same kernel on a multitude of
> machines, even ignoring the issue of things like installers etc.
>
> We had this CONFIG_xxxx disease when it came to SSE, we had it when it
> came to TSC, etc. And in every case it ended up being bad, simply because
> it's not the right interface for _users_.

True, but the nice thing about the linux kernel is that every little
detail can be modified as you like. I think it is very important to answer
the question what skills a person that wants to compile a kernel needs. If
you want to lower the threshold, this sure is an config option that
shouldn't be there.

Maybe the config system should provide an expert-mode to tweak stuff like
this, and enable / disable the irq balancing by default according to the
processor type selected.

Jos

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-14 10:10                     ` Jos Hulzink
@ 2002-08-14 11:12                       ` David Lang
  0 siblings, 0 replies; 34+ messages in thread
From: David Lang @ 2002-08-14 11:12 UTC (permalink / raw)
  To: Jos Hulzink; +Cc: Linus Torvalds, Martin J. Bligh, linux-kernel, Matt Dobson

while we're tweaking config options how about one that will turn on all
the features that fall into the catagory of 'if you have this it will
speed up your system, if you don't there's no performance penalty'

I definantly understand why some people will want to turn those off to
save memory, but it would be nice to have one thing to do to turn them all
on at once when memory isn't that tight.

David Lang

On Wed, 14 Aug 2002, Jos Hulzink wrote:

> Date: Wed, 14 Aug 2002 12:10:34 +0200 (CEST)
> From: Jos Hulzink <josh@stack.nl>
> To: Linus Torvalds <torvalds@transmeta.com>
> Cc: Martin J. Bligh <Martin.Bligh@us.ibm.com>,
>      linux-kernel <linux-kernel@vger.kernel.org>,
>      Matt Dobson <colpatch@us.ibm.com>
> Subject: Re: [PATCH] NUMA-Q disable irqbalance
>
> On Tue, 13 Aug 2002, Linus Torvalds wrote:
>
> > There are tons of reasons to run the same kernel on a multitude of
> > machines, even ignoring the issue of things like installers etc.
> >
> > We had this CONFIG_xxxx disease when it came to SSE, we had it when it
> > came to TSC, etc. And in every case it ended up being bad, simply because
> > it's not the right interface for _users_.
>
> True, but the nice thing about the linux kernel is that every little
> detail can be modified as you like. I think it is very important to answer
> the question what skills a person that wants to compile a kernel needs. If
> you want to lower the threshold, this sure is an config option that
> shouldn't be there.
>
> Maybe the config system should provide an expert-mode to tweak stuff like
> this, and enable / disable the irq balancing by default according to the
> processor type selected.
>
> Jos
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 22:14                       ` Rik van Riel
@ 2002-08-14 14:49                         ` Linus Torvalds
  2002-08-14 15:19                           ` Rik van Riel
  0 siblings, 1 reply; 34+ messages in thread
From: Linus Torvalds @ 2002-08-14 14:49 UTC (permalink / raw)
  To: Rik van Riel; +Cc: Alan Cox, Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Rik van Riel wrote:
> 
> As a clarification to this, I'm not suggesting that interactive
> performance doesn't exist, I'm suggesting that we should measure
> it.

I think the only way to measure it is with a latency measurement thing - 
like the one used for some of the RT tuning.

However, the latency measurement should not care too much about individual
millisecond latencies, but only holler when it finds _combined_ bad
latencies in the 1/10+ second range (which is human-perceptible).

One problem is trying to find a good load for the tester program itself
(it should not just sit in a tight loop, it should have a memory footprint
and some delays of its own).

		Linus

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-14 14:49                         ` Linus Torvalds
@ 2002-08-14 15:19                           ` Rik van Riel
  0 siblings, 0 replies; 34+ messages in thread
From: Rik van Riel @ 2002-08-14 15:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Alan Cox, Martin J. Bligh, linux-kernel, Matt Dobson

On Wed, 14 Aug 2002, Linus Torvalds wrote:
> On Tue, 13 Aug 2002, Rik van Riel wrote:
> >
> > As a clarification to this, I'm not suggesting that interactive
> > performance doesn't exist, I'm suggesting that we should measure
> > it.
>
> I think the only way to measure it is with a latency measurement thing -
> like the one used for some of the RT tuning.
>
> However, the latency measurement should not care too much about individual
> millisecond latencies, but only holler when it finds _combined_ bad
> latencies in the 1/10+ second range (which is human-perceptible).
>
> One problem is trying to find a good load for the tester program itself
> (it should not just sit in a tight loop, it should have a memory footprint
> and some delays of its own).

A start would be Bob Matthews's IRMAN program:

	http://people.redhat.com/bmatthews/irman/


I've also been talking with Randy Hwron(sp?) about doing
latency tests with all his benchmarks.

If I wasn't so distracted by various other TODO items I'd
already have a dbench with latency histograms in it ;)

(yes, I know dbench is a bad benchmark, but it also seems
to be the most abused one.  Making the thing show exactly
how much unfairness and bad latency there is will probably
make some people look at dbench results with very different
eyes)

cheers,

Rik
-- 
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/		http://distro.conectiva.com/


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 23:30                           ` Andrea Arcangeli
@ 2002-08-14 21:16                             ` James Cleverdon
  2002-08-23  2:31                             ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing James Cleverdon
  1 sibling, 0 replies; 34+ messages in thread
From: James Cleverdon @ 2002-08-14 21:16 UTC (permalink / raw)
  To: Andrea Arcangeli, Andrew Theurer
  Cc: Linus Torvalds, Martin J. Bligh, Alan Cox, linux-kernel

On Tuesday 13 August 2002 04:30 pm, Andrea Arcangeli wrote:
> On Tue, Aug 13, 2002 at 05:29:50PM -0500, Andrew Theurer wrote:
> > > 2.4.19-rc3aa3:
> > >
> > > No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> > > 794 Mbps              787 Mbps                        792 Mbps
> > >
> > > With hyperthreading:
> > >
> > > No Balance    Ingo IRQ Balance        Andrea IRQ Balance
> > > 773 Mbps              798 Mbps                        809 Mbps
>
> thanks again for running the above benchmarks.
>
> > version is a little less aggressive and has less overhead, something I'd
> > prefer in 2.5.
>
> Second that of course, btw, the detailed explanataion of the changes I
> did while merging it can be found on lse-tech.
>
> it is also possible HZ/50 is too high frequency still, I didn't run any
> extensive test on the reprogramming frequency. I would suggest to try
> with HZ/10 too (so every 100msec instead of every 20msec).
>
> BTW, the very same algorithm should also be shared by alpha, alpha never
> had hardware irq balancing support, it's like a p4, and we do static
> routing distribution choosed by the kernel at boot which is been pretty
> good so far (better than mainline 2.4 on a p4 smp) but the irqblanace
> algorithm should be better there too.
>
> Andrea

I've been thinking about doing away with balance_irq on P4 boxes by using the 
TPR.  It might even help P3 and other i386 CPUs by routing interrupts 
preferentially to idle CPUs.  And, it would do it in real time, not as a 
snapshot of the past stored in the I/O APIC's dest masks.

What do you think about this?  (Crudely ripped out of my 2.5 summit patch; may 
not apply correctly.)

The code in do_IRQ could be "if (clustered_apic_mode) apic_adj_tpr(TPR_IRQ)" 
or some such, if that kind of APIC foolery is not considered necessary for 
the non-P4 crowd.  (The automatic priority boost for serial APICs with 
unEOIed interrupts should do the job for us.)


diff -ruN 2.5.24/arch/i386/kernel/irq.c d24/arch/i386/kernel/irq.c
--- 2.5.24/arch/i386/kernel/irq.c	Thu Jun 20 15:53:44 2002
+++ d24/arch/i386/kernel/irq.c	Wed Jul 10 13:34:14 2002
@@ -582,6 +582,7 @@
 	unsigned int status;
 
 	kstat.irqs[cpu][irq]++;
+	apic_adj_tpr(TPR_IRQ);
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
@@ -642,6 +643,7 @@
 
 	if (softirq_pending(cpu))
 		do_softirq();
+	apic_adj_tpr(-TPR_IRQ);
 	return 1;
 }
 
diff -ruN 2.5.24/arch/i386/kernel/process.c d24/arch/i386/kernel/process.c
--- 2.5.24/arch/i386/kernel/process.c	Thu Jun 20 15:53:40 2002
+++ d24/arch/i386/kernel/process.c	Wed Jul 10 14:12:57 2002
@@ -145,7 +145,9 @@
 		irq_stat[smp_processor_id()].idle_timestamp = jiffies;
 		while (!need_resched())
 			idle();
+		apic_set_tpr(TPR_TASK);
 		schedule();
+		apic_set_tpr(TPR_IDLE);
 	}
 }
 
diff -ruN 2.5.24/include/asm-i386/apic.h d24/include/asm-i386/apic.h
--- 2.5.24/include/asm-i386/apic.h	Thu Jun 20 15:53:57 2002
+++ d24/include/asm-i386/apic.h	Wed Jul 10 13:34:14 2002
@@ -64,6 +64,22 @@
 	apic_write_around(APIC_EOI, 0);
 }
 
+static inline void apic_set_tpr(unsigned long val)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, (value & ~APIC_TPRI_MASK) + val);
+}
+
+static inline void apic_adj_tpr(long adj)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, value + adj);
+}
+
 extern int get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC (void);
@@ -95,6 +118,15 @@
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
 
+#else /* CONFIG_X86_LOCAL_APIC */
+#define apic_set_tpr(val)
+#define apic_adj_tpr(adj)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+/* Priority values for apic_adj_tpr() and apic_set_tpr() */
+/* xAPICs only do priority comparisons on the upper nibble. */
+#define TPR_IDLE	(0x00ul)
+#define TPR_TASK	(0x10ul)
+#define TPR_IRQ		(0x20ul)	/* Or maybe 0x10 ?? */
+
 #endif /* __ASM_APIC_H */


-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 21:24                       ` Linus Torvalds
  2002-08-13 22:29                         ` Andrew Theurer
@ 2002-08-20  0:49                         ` Dave Hansen
  1 sibling, 0 replies; 34+ messages in thread
From: Dave Hansen @ 2002-08-20  0:49 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, Alan Cox, linux-kernel, Andrew Theurer

[-- Attachment #1: Type: text/plain, Size: 2078 bytes --]

Linus Torvalds wrote:
> On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> 
>>Was that before or after you changed HZ to 1000? I *think* that increased
>>the frequency of IO-APIC reprogramming by a factor of 10, though I might
>>be misreading the code. If it does depend on HZ, I think that's bad.
> 
> The 1000Hz thing came much later, and I never noticed any impact of that 
> on my machines.
> 
> (Note that this is all entrely subjective. I was very disappointed in the
> feel of the first HT P4 machine I had for the first few weeks, but apart
> from running lmbench - which looked ok even though it shows that P4's are
> bad at system calls - I've not actually put numbers on it. But my feeling
> was that the irq thing made a noticeable difference. Caveat emptor -
> subjective feelings are not good).
> 
>>People in our benchmarking group (Andrew, cc'ed) have told me that
>>reducing the frequency of IO-APIC reprogramming by a factor of 20 or
>>so improves performance greatly - don't know what HZ that was at, but
>>the whole thing seems a little overenthusiastic to me.
> 
> The rebalancing was certainly done with a 100Hz clock, so yes, it might 
> have become much worse lately.

Here's a patch from Andrea's tree that uses IRQ_BALANCE_INTERVAL to 
define how often interrupts are balanced, staying independent from HZ. 
  It also makes sure that there _is_ a change to the configuration 
before it actually writes it.  It reminds me of the mod_timer 
optimization.

While observing the affect that this has on /proc/interrupts, I 
noticed that timer interrupts aren't very balanced across my 8 CPUs.

stock 2.5.30 kernel:
       CPU0  CPU1  CPU2  CPU3  CPU4  CPU5    CPU6    CPU7
   0:213935 63894 32373  1322  1900  1779    2895  181441  timer

2.5.31 w/attached patch
       CPU0  CPU1  CPU2  CPU3 CPU4   CPU5    CPU6    CPU7
   0: 19322  8778 35805 36771 9219  14091 1091741 1092294   timer

So, my patch didn't cause this situation.  I don't know if it is 
normal, but the other irq's were much more balanced than the timer one.
-- 
Dave Hansen
haveblue@us.ibm.com

[-- Attachment #2: irq-balance-tune-2.5.31+bk-0.patch --]
[-- Type: text/plain, Size: 1730 bytes --]

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.489   -> 1.490  
#	arch/i386/kernel/io_apic.c	1.26    -> 1.27   
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/08/19	haveblue@elm3b96.(none)	1.490
# Reduce irqbalance's dependence on HZ.  Only write to io_apic when there is actually
# a change to write into it.  
# --------------------------------------------
#
diff -Nru a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
--- a/arch/i386/kernel/io_apic.c	Mon Aug 19 14:49:03 2002
+++ b/arch/i386/kernel/io_apic.c	Mon Aug 19 14:49:03 2002
@@ -220,6 +220,9 @@
 		((1 << cpu) & (allowed_mask))
 
 #if CONFIG_SMP
+
+#define IRQ_BALANCE_INTERVAL (HZ/50)
+	
 static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
 {
 	int search_idle = 1;
@@ -254,8 +257,9 @@
 	if (clustered_apic_mode)
 		return;
 
-	if (entry->timestamp != now) {
+	if (unlikely(time_after(now, entry->timestamp + IRQ_BALANCE_INTERVAL))) {
 		unsigned long allowed_mask;
+		unsigned int new_cpu;
 		int random_number;
 
 		rdtscl(random_number);
@@ -263,8 +267,11 @@
 
 		allowed_mask = cpu_online_map & irq_affinity[irq];
 		entry->timestamp = now;
-		entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
-		set_ioapic_affinity(irq, 1 << entry->cpu);
+		new_cpu = move(entry->cpu, allowed_mask, now, random_number);
+		if (entry->cpu != new_cpu) {
+			entry->cpu = new_cpu;
+			set_ioapic_affinity(irq, 1 << new_cpu);
+		}
 	}
 }
 #else /* !SMP */

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-13 23:30                           ` Andrea Arcangeli
  2002-08-14 21:16                             ` James Cleverdon
@ 2002-08-23  2:31                             ` James Cleverdon
  1 sibling, 0 replies; 34+ messages in thread
From: James Cleverdon @ 2002-08-23  2:31 UTC (permalink / raw)
  To: Andrea Arcangeli, Andrew Theurer, acpi-devel
  Cc: Linus Torvalds, Martin J. Bligh, Alan Cox, linux-kernel,
	Dave Jones

Here's my first cut of the 2.5 summit patch that allows you to boot the x440 
NUMA box and actually get all CPUs on-line.  While similar to the patch in 
Alan's 2.4 tree (and in SuSE 8.0), this patch uses logical mode interrupts so 
that we can make the TPR hardware to do real time IRQ routing to less busy 
CPUs.  As a result, this code may do P3 and earlier systems some good as 
well.  No need for the balance_irq function (which is crudely commented out) 
on P4 boxen.

What's the catch?  I'm glad you asked.  On my test systems this drops all SCSI 
interrupts when the ACPI hyperthreading-only config option is turned on.  The 
system boots fine when turned off, using the MPS table.  Funny thing:  the 
IRQ table shows 38 entries with MPS but only 18 for ACPI -- just about what 
you'd expect for the legacy IRQs plus some interrupt source overrides.  Does 
anyone know if this is expected behavior?  If not, what happened to the other 
IRQs?

Note:  I can't do a thing about the xAPIC bridge HW's tie breaker rule.  On 
idle systems the lowest numbered CPU in each APIC cluster is going to be hit 
by most of the interrupts.  So what?  It was idle anyway.  On busier systems, 
the interrupt counts start evening out.  So, folks should not expect 
balance_irq's nicely spread IRQ counts across all CPUs, but can hopefully 
enjoy some performance gains instead.

Anyway, here it is.  Applies to 2.5.31.  Comments and advice are very welcome:

diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
--- 2.5.31/arch/i386/kernel/acpi.c	Sat Aug 10 18:41:53 2002
+++ s31/arch/i386/kernel/acpi.c	Wed Aug 14 19:30:13 2002
@@ -114,6 +114,7 @@
 	unsigned long		size)
 {
 	struct acpi_table_madt	*madt = NULL;
+	extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);
 
 	if (!phys_addr || !size)
 		return -EINVAL;
@@ -130,6 +131,8 @@
 	printk(KERN_INFO PREFIX "Local APIC address 0x%08x\n",
 		madt->lapic_address);
 
+	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+
 	return 0;
 }
 
@@ -301,6 +304,7 @@
 	char			*cmdline)
 {
 	int			result = 0;
+	extern void		smp_cluster_apic_check(void);
 
 	/*
 	 * The default interrupt routing model is PIC (8259).  This gets
@@ -416,8 +420,10 @@
 #endif /*CONFIG_X86_IO_APIC*/
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	if (acpi_lapic && acpi_ioapic)
+	if (acpi_lapic && acpi_ioapic) {
 		smp_found_config = 1;
+		smp_cluster_apic_check();
+	}
 #endif
 
 	return 0;
diff -ruN 2.5.31/arch/i386/kernel/apic.c s31/arch/i386/kernel/apic.c
--- 2.5.31/arch/i386/kernel/apic.c	Sat Aug 10 18:41:29 2002
+++ s31/arch/i386/kernel/apic.c	Wed Aug 14 19:30:13 2002
@@ -29,6 +29,7 @@
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
+#include <asm/smpboot.h>
 
 /* Using APIC to generate smp_local_timer_interrupt? */
 int using_apic_timer = 0;
@@ -272,6 +273,16 @@
 	apic_write_around(APIC_LVT1, value);
 }
 
+static inline unsigned long apic_ldr_value(unsigned long value)
+{
+	if (clustered_apic_numaq)
+		return (value);
+	if (clustered_apic_xapic)
+		return (((value) & ~APIC_LDR_MASK) |
+			SET_APIC_LOGICAL_ID(physical_to_logical_apicid(hard_smp_processor_id())));
+	return (((value) & ~APIC_LDR_MASK) | SET_APIC_LOGICAL_ID(1UL << 
smp_processor_id()));
+}
+
 void __init setup_local_APIC (void)
 {
 	unsigned long value, ver, maxlvt;
@@ -304,21 +315,22 @@
 	 * document number 292116).  So here it goes...
 	 */
 
-	if (!clustered_apic_mode) {
+	if (!clustered_apic_numaq) {
 		/*
-		 * In clustered apic mode, the firmware does this for us 
-		 * Put the APIC into flat delivery mode.
-		 * Must be "all ones" explicitly for 82489DX.
+		 * For NUMA-Q, the firmware does this for us.  Otherwise, put the APIC into 
clustered or flat
+		 *
+		 * delivery mode.  Must be "all ones" explicitly for 82489DX.
 		 */
-		apic_write_around(APIC_DFR, 0xffffffff);
+		if (clustered_apic_mode)
+			apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+		else
+			apic_write_around(APIC_DFR, APIC_DFR_FLAT);
 
 		/*
 		 * Set up the logical destination ID.
 		 */
 		value = apic_read(APIC_LDR);
-		value &= ~APIC_LDR_MASK;
-		value |= (1<<(smp_processor_id()+24));
-		apic_write_around(APIC_LDR, value);
+		apic_write_around(APIC_LDR, apic_ldr_value(value));
 	}
 
 	/*
diff -ruN 2.5.31/arch/i386/kernel/io_apic.c s31/arch/i386/kernel/io_apic.c
--- 2.5.31/arch/i386/kernel/io_apic.c	Sat Aug 10 18:41:26 2002
+++ s31/arch/i386/kernel/io_apic.c	Wed Aug 14 19:30:13 2002
@@ -35,6 +35,7 @@
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
+#include <asm/smpboot.h>
 
 #undef APIC_LOCKUP_DEBUG
 
@@ -261,7 +262,7 @@
 		allowed_mask = cpu_online_map & irq_affinity[irq];
 		entry->timestamp = now;
 		entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
-		set_ioapic_affinity(irq, 1 << entry->cpu);
+		set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));
 	}
 }
 #else /* !SMP */
@@ -682,9 +683,40 @@
 	return current_vector;
 }
 
+/*
+ * round_robin_cpu_apic_id -- Since i386 Linux doesn't use the APIC TPRs to
+ * set task/interrupt priority, xAPICs' tiebreaker rule tends to hit one CPU
+ * with all interrupts for each quad.  Distribute the interrupts using a
+ * simple round robin scheme.
+ */
+static int round_robin_cpu_apic_id(void)
+{
+	int val;
+	static unsigned	next_cpu = 0;
+
+	if (next_cpu >= NR_CPUS || cpu_2_logical_apicid[next_cpu] == BAD_APICID)
+		next_cpu = 0;
+	val = cpu_present_to_apicid(next_cpu) | APIC_DEST_CPUS_MASK;
+	++next_cpu;
+	return (val);
+}
+
+static inline int target_cpus(void)
+{
+	if (clustered_apic_numaq)
+		return APIC_BROADCAST_ID_APIC;	/* broadcast to local quad */
+	if (clustered_apic_xapic)
+		return round_robin_cpu_apic_id();
+	return logical_cpu_present_map & 0xFFu;
+//	return cpu_online_map;
+}
+
 static struct hw_interrupt_type ioapic_level_irq_type;
 static struct hw_interrupt_type ioapic_edge_irq_type;
 
+#undef KERN_DEBUG
+#define KERN_DEBUG
+
 void __init setup_IO_APIC_irqs(void)
 {
 	struct IO_APIC_route_entry entry;
@@ -702,9 +734,9 @@
 		memset(&entry,0,sizeof(entry));
 
 		entry.delivery_mode = dest_LowestPrio;
-		entry.dest_mode = INT_DELIVERY_MODE;
+		entry.dest_mode = INT_DEST_ADDR_MODE;
 		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest = TARGET_CPUS;
+		entry.dest.logical.logical_dest = target_cpus();
 
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
@@ -722,7 +754,6 @@
 		if (irq_trigger(idx)) {
 			entry.trigger = 1;
 			entry.mask = 1;
-			entry.dest.logical.logical_dest = TARGET_CPUS;
 		}
 
 		irq = pin_2_irq(idx, apic, pin);
@@ -782,9 +813,9 @@
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
 	 */
-	entry.dest_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_ADDR_MODE;
 	entry.mask = 0;					/* unmask IRQ now */
-	entry.dest.logical.logical_dest = TARGET_CPUS;
+	entry.dest.logical.logical_dest = target_cpus();
 	entry.delivery_mode = dest_LowestPrio;
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -1141,7 +1172,7 @@
 		
 		old_id = mp_ioapics[apic].mpc_apicid;
 
-		if (mp_ioapics[apic].mpc_apicid >= 0xf) {
+		if (mp_ioapics[apic].mpc_apicid >= apic_broadcast_id) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
 				apic, mp_ioapics[apic].mpc_apicid);
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
@@ -1153,14 +1184,16 @@
 		 * Sanity check, is the ID really free? Every APIC in a
 		 * system must have a unique ID or we get lots of nice
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
+		 * I/O APIC IDs no longer have any meaning for xAPICs.
 		 */
-		if (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid)) {
+		if (!clustered_apic_xapic &&
+		    (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid))) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
 				apic, mp_ioapics[apic].mpc_apicid);
 			for (i = 0; i < 0xf; i++)
 				if (!(phys_id_present_map & (1 << i)))
 					break;
-			if (i >= 0xf)
+			if (i >= apic_broadcast_id)
 				panic("Max APIC ID exceeded!\n");
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
@@ -1288,7 +1321,7 @@
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
-	balance_irq(irq);
+//	balance_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1328,7 +1361,7 @@
 	unsigned long v;
 	int i;
 
-	balance_irq(irq);
+//	balance_irq(irq);
 /*
  * It appears there is an erratum which affects at least version 0x11
  * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -1849,8 +1882,8 @@
 	memset(&entry,0,sizeof(entry));
 
 	entry.delivery_mode = dest_LowestPrio;
-	entry.dest_mode = INT_DELIVERY_MODE;
-	entry.dest.logical.logical_dest = TARGET_CPUS;
+	entry.dest_mode = INT_DEST_ADDR_MODE;
+	entry.dest.logical.logical_dest = target_cpus();
 	entry.mask = 1;					 /* Disabled (masked) */
 	entry.trigger = 1;				   /* Level sensitive */
 	entry.polarity = 1;					/* Low active */
diff -ruN 2.5.31/arch/i386/kernel/irq.c s31/arch/i386/kernel/irq.c
--- 2.5.31/arch/i386/kernel/irq.c	Sat Aug 10 18:41:19 2002
+++ s31/arch/i386/kernel/irq.c	Thu Aug 22 17:48:15 2002
@@ -332,6 +332,7 @@
 
 	irq_enter();
 	kstat.irqs[cpu][irq]++;
+	apic_adj_tpr(TPR_IRQ);
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
@@ -389,6 +390,7 @@
 	 */
 	desc->handler->end(irq);
 	spin_unlock(&desc->lock);
+	apic_adj_tpr(-TPR_IRQ);
 
 	irq_exit();
 
diff -ruN 2.5.31/arch/i386/kernel/mpparse.c s31/arch/i386/kernel/mpparse.c
--- 2.5.31/arch/i386/kernel/mpparse.c	Sat Aug 10 18:41:25 2002
+++ s31/arch/i386/kernel/mpparse.c	Wed Aug 14 19:30:13 2002
@@ -30,6 +30,7 @@
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
 #include <asm/io_apic.h>
+#include <asm/smpboot.h>
 
 /* Have we found an MP table */
 int smp_found_config;
@@ -68,6 +69,13 @@
 
 /* Bitmask of physically existing CPUs */
 unsigned long phys_cpu_present_map;
+unsigned long logical_cpu_present_map;
+
+u32 apic_broadcast_id = APIC_BROADCAST_ID_APIC;
+u8 clustered_apic_mode = 0;
+u8 esr_disable = 0;
+u8 raw_phys_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+static u8 clustered_hint = 0;
 
 /*
  * Intel MP BIOS table parsing routines:
@@ -104,8 +112,8 @@
 	if (!(m->mpc_cpuflag & CPU_ENABLED))
 		return;
 
-	logical_apicid = m->mpc_apicid;
-	if (clustered_apic_mode) {
+	logical_apicid = 0x01;
+	if (clustered_apic_numaq) {
 		quad = translation_table[mpc_record]->trans_quad;
 		logical_apicid = (quad << 4) + 
 			(m->mpc_apicid ? m->mpc_apicid << 1 : 1);
@@ -186,11 +194,8 @@
 	}
 	ver = m->mpc_apicver;
 
-	if (clustered_apic_mode) {
-		phys_cpu_present_map |= (logical_apicid&0xf) << (4*quad);
-	} else {
-		phys_cpu_present_map |= 1 << m->mpc_apicid;
-	}
+	logical_cpu_present_map |= 1 << (num_processors-1);
+	phys_cpu_present_map |= apicid_to_phys_cpu_present(m->mpc_apicid);
 	/*
 	 * Validate version
 	 */
@@ -199,6 +204,7 @@
 		ver = 0x10;
 	}
 	apic_version[m->mpc_apicid] = ver;
+	raw_phys_apicid[num_processors - 1] = m->mpc_apicid;
 }
 
 static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -209,7 +215,7 @@
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 	
-	if (clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		quad = translation_table[mpc_record]->trans_quad;
 		mp_bus_id_to_node[m->mpc_busid] = quad;
 		mp_bus_id_to_local[m->mpc_busid] = 
translation_table[mpc_record]->trans_local;
@@ -253,6 +259,15 @@
 	}
 	mp_ioapics[nr_ioapics] = *m;
 	nr_ioapics++;
+	/******
+	 * Warning!  We have an APIC version number collision between the APICs
+	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
+	 * define the xAPIC ver ID range until late in the development cycle,
+	 * so there is working silicon out there that doesn't match it.
+	 * A test in smp_cluster_apic_check() resolves the above conflict.
+	 ******/
+	if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
 }
 
 static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
@@ -348,12 +363,39 @@
 }
 
 /*
+ * Common code for MPS and ACPI/MADT.
+ */
+void __init smp_cluster_apic_check(void)
+{
+	int i;
+	u8 cluster;
+	static const char *mode_names[] = {
+		"Flat", "Clustered NUMA-Q", "Clustered xAPIC", "???"
+	};
+
+	if (clustered_hint) {
+		if (clustered_hint & CLUSTERED_APIC_NUMAQ) {
+			/* NUMA-Q boxes never had xAPICs */
+			clustered_hint &= ~CLUSTERED_APIC_XAPIC;
+		}
+		clustered_apic_mode = clustered_hint;
+		esr_disable = 1;
+		if (clustered_apic_xapic)
+			apic_broadcast_id = APIC_BROADCAST_ID_XAPIC;
+		phys_cpu_present_map = logical_cpu_present_map;
+	}
+	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
+		mode_names[clustered_apic_mode], nr_ioapics);
+}
+
+/*
  * Read/parse the MPC
  */
 
 static int __init smp_read_mpc(struct mp_config_table *mpc)
 {
-	char str[16];
+	char oem[10];
+	char prod[14];
 	int count=sizeof(*mpc);
 	unsigned char *mpt=((unsigned char *)mpc)+count;
 
@@ -378,13 +440,21 @@
 		printk(KERN_ERR "SMP mptable: null local APIC address!\n");
 		return 0;
 	}
-	memcpy(str,mpc->mpc_oem,8);
-	str[8]=0;
-	printk("OEM ID: %s ",str);
-
-	memcpy(str,mpc->mpc_productid,12);
-	str[12]=0;
-	printk("Product ID: %s ",str);
+	memcpy(oem, mpc->mpc_oem, 8);
+	oem[8] = 0;
+	memcpy(prod, mpc->mpc_productid, 12);
+	prod[12] = 0;
+	printk("OEM ID: %s ", oem);
+	printk("Product ID: %s ",prod);
+	/*
+	 * Can't recognize Summit xAPICs (see MP_ioapic_info), so use
+	 * OEM/Product IDs.
+	 */
+	if (!strncmp(oem, "IBM ENSW", 8) &&
+	    (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+	else if (!strncmp(oem, "IBM NUMA", 8))
+		clustered_hint |= CLUSTERED_APIC_NUMAQ;
 
 	printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
 
@@ -395,7 +465,7 @@
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
 
-	if (clustered_apic_mode && mpc->mpc_oemptr) {
+	if (clustered_apic_numaq && mpc->mpc_oemptr) {
 		/* We need to process the oem mpc tables to tell us which quad things are 
in ... */
 		mpc_record = 0;
 		smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, 
mpc->mpc_oemsize);
@@ -463,6 +533,7 @@
 		}
 		++mpc_record;
 	}
+	smp_cluster_apic_check();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
@@ -934,6 +1005,17 @@
 		mp_ioapic_routing[idx].irq_start,
 		mp_ioapic_routing[idx].irq_end);
 
+	/******
+	 * Warning!  We have an APIC version number collision between the APICs
+	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
+	 * define the xAPIC ver ID range until late in the development cycle,
+	 * so there is working silicon out there that doesn't match it.
+	 * A test in smp_cluster_apic_check() resolves the above conflict.
+	 ******/
+	if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
+	    mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+
 	return;
 }
 
@@ -1051,6 +1133,13 @@
 	return;
 }
 
+/* Hook from generic ACPI tables.c */
+void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8))
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+}
+
 #ifdef CONFIG_ACPI_PCI
 
 void __init mp_parse_prt (void)
diff -ruN 2.5.31/arch/i386/kernel/process.c s31/arch/i386/kernel/process.c
--- 2.5.31/arch/i386/kernel/process.c	Sat Aug 10 18:41:15 2002
+++ s31/arch/i386/kernel/process.c	Wed Aug 14 19:30:13 2002
@@ -145,7 +145,9 @@
 		irq_stat[smp_processor_id()].idle_timestamp = jiffies;
 		while (!need_resched())
 			idle();
+		apic_set_tpr(TPR_TASK);
 		schedule();
+		apic_set_tpr(TPR_IDLE);
 	}
 }
 
@@ -197,7 +199,7 @@
 			}
 				/* we will leave sorting out the final value 
 				when we are ready to reboot, since we might not
- 				have set up boot_cpu_id or smp_num_cpu */
+ 				have set up boot_cpu_physical_apicid or smp_num_cpu */
 			break;
 #endif
 		}
diff -ruN 2.5.31/arch/i386/kernel/smpboot.c s31/arch/i386/kernel/smpboot.c
--- 2.5.31/arch/i386/kernel/smpboot.c	Sat Aug 10 18:41:28 2002
+++ s31/arch/i386/kernel/smpboot.c	Wed Aug 14 19:30:13 2002
@@ -498,59 +498,23 @@
 	return do_fork(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0);
 }
 
-/* which physical APIC ID maps to which logical CPU number */
-volatile int physical_apicid_2_cpu[MAX_APICID];
 /* which logical CPU number maps to which physical APIC ID */
-volatile int cpu_2_physical_apicid[NR_CPUS];
+volatile u8 cpu_2_physical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID 
};
 
-/* which logical APIC ID maps to which logical CPU number */
-volatile int logical_apicid_2_cpu[MAX_APICID];
 /* which logical CPU number maps to which logical APIC ID */
-volatile int cpu_2_logical_apicid[NR_CPUS];
+volatile u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID 
};
 
-static inline void init_cpu_to_apicid(void)
-/* Initialize all maps between cpu number and apicids */
-{
-	int apicid, cpu;
-
-	for (apicid = 0; apicid < MAX_APICID; apicid++) {
-		physical_apicid_2_cpu[apicid] = -1;
-		logical_apicid_2_cpu[apicid] = -1;
-	}
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		cpu_2_physical_apicid[cpu] = -1;
-		cpu_2_logical_apicid[cpu] = -1;
-	}
-}
 
-static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * set up a mapping between cpu and apicid. Uses logical apicids for 
multiquad,
- * else physical apic ids
- */
+static inline void map_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8 
log_apicid)
 {
-	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_logical_apicid[cpu] = apicid;
-	} else {
-		physical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_physical_apicid[cpu] = apicid;
-	}
+	cpu_2_logical_apicid[cpu] = log_apicid;
+	cpu_2_physical_apicid[cpu] = phys_apicid;
 }
 
-static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
- */
+static inline void unmap_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8 
log_apicid)
 {
-	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_logical_apicid[cpu] = -1;
-	} else {
-		physical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_physical_apicid[cpu] = -1;
-	}
+	cpu_2_logical_apicid[cpu] = BAD_APICID;
+	cpu_2_physical_apicid[cpu] = BAD_APICID;
 }
 
 #if APIC_DEBUG
@@ -764,7 +728,7 @@
 
 extern unsigned long cpu_initialized;
 
-static void __init do_boot_cpu (int apicid) 
+static void __init do_boot_cpu(u8 phys_apicid, u8 log_apicid)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -774,7 +738,7 @@
 	unsigned long boot_error = 0;
 	int timeout, cpu;
 	unsigned long start_eip;
-	unsigned short nmi_high, nmi_low;
+	unsigned short nmi_high = 0, nmi_low = 0;
 
 	cpu = ++cpucount;
 	/*
@@ -791,7 +755,7 @@
 	 */
 	init_idle(idle, cpu);
 
-	map_cpu_to_boot_apicid(cpu, apicid);
+	map_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 
@@ -801,7 +765,8 @@
 	start_eip = setup_trampoline();
 
 	/* So we see what's up   */
-	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+	printk("Booting processor %d/0x%02X/0x%02X eip 0x%lX\n",
+		cpu, phys_apicid, log_apicid, start_eip);
 	stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle->thread_info);
 
 	/*
@@ -813,7 +778,7 @@
 
 	Dprintk("Setting warm reset code and vector.\n");
 
-	if (clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		/* stash the current NMI vector, so we can put things back */
 		nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
 		nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
@@ -830,7 +795,7 @@
 	/*
 	 * Be paranoid about clearing APIC errors.
 	 */
-	if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[apicid])) {
+	if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[phys_apicid])) {
 		apic_read_around(APIC_SPIV);
 		apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
@@ -845,10 +810,10 @@
 	 * Starting actual IPI sequence...
 	 */
 
-	if (clustered_apic_mode)
-		boot_error = wakeup_secondary_via_NMI(apicid);
-	else 
-		boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
+	if (clustered_apic_numaq)
+		boot_error = wakeup_secondary_via_NMI(log_apicid);
+	else
+		boot_error = wakeup_secondary_via_INIT(phys_apicid, start_eip);
 
 	if (!boot_error) {
 		/*
@@ -883,14 +848,15 @@
 				/* trampoline code not run */
 				printk("Not responding.\n");
 #if APIC_DEBUG
+			/* xAPICs don't do remote inquiries. */
 			if (!clustered_apic_mode)
-				inquire_remote_apic(apicid);
+				inquire_remote_apic(phys_apicid);
 #endif
 		}
 	}
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
-		unmap_cpu_to_boot_apicid(cpu, apicid);
+		unmap_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
 		clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
 		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
 		cpucount--;
@@ -899,7 +865,7 @@
 	/* mark "stuck" area as not stuck */
 	*((volatile unsigned long *)phys_to_virt(8192)) = 0;
 
-	if(clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		printk("Restoring NMI vector\n");
 		*((volatile unsigned short *) TRAMPOLINE_HIGH) = nmi_high;
 		*((volatile unsigned short *) TRAMPOLINE_LOW) = nmi_low;
@@ -958,7 +924,6 @@
 extern int prof_old_multiplier[NR_CPUS];
 extern int prof_counter[NR_CPUS];
 
-static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
@@ -966,9 +931,11 @@
 
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
-	int apicid, cpu, bit;
+	int cpu, bit;
+	u8 phys_apicid, log_apicid;
 
-        if (clustered_apic_mode && (numnodes > 1)) {
+#ifdef CONFIG_MULTIQUAD
+        if (clustered_apic_numaq && (numnodes > 1)) {
                 printk("Remapping cross-quad port I/O for %d quads\n",
 			numnodes);
                 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
@@ -977,6 +944,7 @@
                 xquad_portio = ioremap (XQUAD_PORTIO_BASE, 
 			numnodes * XQUAD_PORTIO_LEN);
         }
+#endif
 
 #ifdef CONFIG_MTRR
 	/*  Must be done before other processors booted  */
@@ -993,8 +961,6 @@
 		prof_multiplier[cpu] = 1;
 	}
 
-	init_cpu_to_apicid();
-
 	/*
 	 * Setup boot CPU information
 	 */
@@ -1007,8 +973,14 @@
 	 */
 	set_bit(0, &cpu_online_map);
 	set_bit(0, &cpu_callout_map);
-	boot_cpu_logical_apicid = logical_smp_processor_id();
-	map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+	if (clustered_apic_xapic)
+		boot_cpu_logical_apicid = 
physical_to_logical_apicid(boot_cpu_physical_apicid);
+	else if (clustered_apic_numaq)
+		boot_cpu_logical_apicid = logical_smp_processor_id();
+	else
+		boot_cpu_logical_apicid = 0x01;
+	map_cpu_to_boot_apicid(0, boot_cpu_physical_apicid, 
boot_cpu_logical_apicid);
+printk("Boot CPU #0/0x%02X/0x%02X\n", boot_cpu_physical_apicid, 
boot_cpu_logical_apicid);
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
@@ -1085,28 +1057,44 @@
 	 */
 	Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
 
-	for (bit = 0; bit < NR_CPUS; bit++) {
-		apicid = cpu_present_to_apicid(bit);
+	for (cpu = 1, bit = 0; bit < NR_CPUS; bit++) {
+		if (!(logical_cpu_present_map & (1ul << bit)))
+			continue;
+		if ((max_cpus >= 0) && (max_cpus <= cpucount + 1))
+			continue;
+		phys_apicid = raw_phys_apicid[bit];
 		/*
 		 * Don't even attempt to start the boot CPU!
 		 */
-		if (apicid == boot_cpu_apicid)
+		if (phys_apicid == boot_cpu_physical_apicid)
 			continue;
-
-		if (!(phys_cpu_present_map & (1 << bit)))
-			continue;
-		if (max_cpus <= cpucount+1)
+		if (phys_apicid == BAD_APICID)
 			continue;
+		if (clustered_apic_xapic)
+			log_apicid = (u8)physical_to_logical_apicid(phys_apicid);
+		else if (clustered_apic_numaq)
+			log_apicid = ((bit >> 2) << 4) | (1 << (bit & 0x3));
+		else {
+			/* Yes, this overflows if cpu > 7.  The APIC
+			 * destination register is only 8 bits wide.
+			 * For more than 8 CPUs, must use clustered mode. */
+			log_apicid = 1u << cpu;
+			if (log_apicid == 0)
+				BUG();
+		}
 
-		do_boot_cpu(apicid);
+		do_boot_cpu(phys_apicid, log_apicid);
 
 		/*
 		 * Make sure we unmap all failed CPUs
 		 */
-		if ((boot_apicid_to_cpu(apicid) == -1) &&
-				(phys_cpu_present_map & (1 << bit)))
-			printk("CPU #%d not responding - cannot use it.\n",
-								apicid);
+		if ((cpu_2_physical_apicid[cpu] == BAD_APICID) &&
+				(logical_cpu_present_map & (1ul << bit))) {
+			printk("CPU #%d/0x%02X/0x%02X not responding - cannot use it.\n",
+					bit, phys_apicid, log_apicid);
+			logical_cpu_present_map &= ~(1ul << bit);
+		} else
+			++cpu;		/* Got a live one. */
 	}
 
 	/*
diff -ruN 2.5.31/arch/i386/kernel/trampoline.S 
s31/arch/i386/kernel/trampoline.S
--- 2.5.31/arch/i386/kernel/trampoline.S	Sat Aug 10 18:41:27 2002
+++ s31/arch/i386/kernel/trampoline.S	Wed Aug 14 19:30:13 2002
@@ -36,9 +36,7 @@
 
 ENTRY(trampoline_data)
 r_base = .
-#ifdef CONFIG_MULTIQUAD
 	wbinvd
-#endif /* CONFIG_MULTIQUAD */
 	mov	%cs, %ax	# Code and data in the same place
 	mov	%ax, %ds
 
diff -ruN 2.5.31/include/asm-i386/apic.h s31/include/asm-i386/apic.h
--- 2.5.31/include/asm-i386/apic.h	Sat Aug 10 18:42:05 2002
+++ s31/include/asm-i386/apic.h	Wed Aug 14 19:31:11 2002
@@ -64,6 +64,22 @@
 	apic_write_around(APIC_EOI, 0);
 }
 
+static inline void apic_set_tpr(unsigned long val)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, (value & ~APIC_TPRI_MASK) + val);
+}
+
+static inline void apic_adj_tpr(long adj)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, value + adj);
+}
+
 extern int get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC (void);
@@ -96,6 +112,15 @@
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
 
+#else /* CONFIG_X86_LOCAL_APIC */
+#define apic_set_tpr(val)
+#define apic_adj_tpr(adj)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+/* Priority values for apic_adj_tpr() and apic_set_tpr() */
+/* xAPICs only do priority comparisons on the upper nibble. */
+#define TPR_IDLE	(0x00L)
+#define TPR_TASK	(0x10L)
+#define TPR_IRQ		(0x10L)
+
 #endif /* __ASM_APIC_H */
diff -ruN 2.5.31/include/asm-i386/apicdef.h s31/include/asm-i386/apicdef.h
--- 2.5.31/include/asm-i386/apicdef.h	Sat Aug 10 18:41:36 2002
+++ s31/include/asm-i386/apicdef.h	Wed Aug 14 19:30:13 2002
@@ -11,8 +11,10 @@
 #define		APIC_DEFAULT_PHYS_BASE	0xfee00000
  
 #define		APIC_ID		0x20
-#define			APIC_ID_MASK		(0x0F<<24)
-#define			GET_APIC_ID(x)		(((x)>>24)&0x0F)
+#define			APIC_ID_MASK		(0xFF<<24)
+#define			GET_APIC_ID(x)		(((x)>>24)&0xFF)
+#define				XAPIC_VER_LOW	0x14	/* Version num range */
+#define				XAPIC_VER_HIGH	0x1F
 #define		APIC_LVR	0x30
 #define			APIC_LVR_MASK		0xFF00FF
 #define			GET_APIC_VERSION(x)	((x)&0xFF)
@@ -32,6 +34,8 @@
 #define			SET_APIC_LOGICAL_ID(x)	(((x)<<24))
 #define			APIC_ALL_CPUS		0xFF
 #define		APIC_DFR	0xE0
+#define			APIC_DFR_CLUSTER	0x0FFFFFFFul	/* Clustered */
+#define			APIC_DFR_FLAT		0xFFFFFFFFul	/* Flat mode */
 #define		APIC_SPIV	0xF0
 #define			APIC_SPIV_FOCUS_DISABLED	(1<<9)
 #define			APIC_SPIV_APIC_ENABLED		(1<<8)
@@ -58,6 +62,7 @@
 #define			APIC_INT_ASSERT		0x04000
 #define			APIC_ICR_BUSY		0x01000
 #define			APIC_DEST_LOGICAL	0x00800
+#define				APIC_DEST_PHYSICAL	0x0	/* For symmetry */
 #define			APIC_DM_FIXED		0x00000
 #define			APIC_DM_LOWEST		0x00100
 #define			APIC_DM_SMI		0x00200
@@ -108,7 +113,13 @@
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 
-#define MAX_IO_APICS 8
+#define MAX_IO_APICS 32	/* Summit boxes can have 4*(2+3*2) I/O APICs */
+
+/*
+ * The intr broadcast ID is 0xF for old APICs and 0xFF for xAPICs.
+ */
+#define APIC_BROADCAST_ID_XAPIC	0xFF
+#define APIC_BROADCAST_ID_APIC	0x0F
 
 /*
  * the local APIC register structure, memory mapped. Not terribly well
diff -ruN 2.5.31/include/asm-i386/mpspec.h s31/include/asm-i386/mpspec.h
--- 2.5.31/include/asm-i386/mpspec.h	Sat Aug 10 18:41:16 2002
+++ s31/include/asm-i386/mpspec.h	Wed Aug 14 19:30:13 2002
@@ -14,13 +14,10 @@
 #define SMP_MAGIC_IDENT	(('_'<<24)|('P'<<16)|('M'<<8)|'_')
 
 /*
- * a maximum of 16 APICs with the current APIC ID architecture.
+ * A maximum of 16 APICs with the classic APIC ID architecture.
+ * xAPICs can have up to 256.
  */
-#ifdef CONFIG_MULTIQUAD
 #define MAX_APICS 256
-#else /* !CONFIG_MULTIQUAD */
-#define MAX_APICS 16
-#endif /* CONFIG_MULTIQUAD */
 
 #define MAX_MPC_ENTRY 1024
 
@@ -204,6 +201,7 @@
 extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
 
 extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int boot_cpu_logical_apicid;
 extern unsigned long phys_cpu_present_map;
 extern int smp_found_config;
 extern void find_smp_config (void);
diff -ruN 2.5.31/include/asm-i386/smp.h s31/include/asm-i386/smp.h
--- 2.5.31/include/asm-i386/smp.h	Sat Aug 10 18:41:18 2002
+++ s31/include/asm-i386/smp.h	Wed Aug 14 19:30:13 2002
@@ -19,33 +19,56 @@
 #include <asm/io_apic.h>
 #endif
 #include <asm/apic.h>
-#endif
-#endif
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_X86_LOCAL_APIC */
 
-#ifdef CONFIG_SMP
-# ifdef CONFIG_MULTIQUAD
-#  define TARGET_CPUS 0xf     /* all CPUs in *THIS* quad */
-#  define INT_DELIVERY_MODE 0     /* physical delivery on LOCAL quad */
-# else
-#  define TARGET_CPUS cpu_online_map
-#  define INT_DELIVERY_MODE 1     /* logical delivery broadcast to all procs 
*/
-# endif
-#else
-# define INT_DELIVERY_MODE 1     /* logical delivery */
-# define TARGET_CPUS 0x01
-#endif
+#ifndef __ASSEMBLY__
+extern u8 clustered_apic_mode;
+extern u8 esr_disable;
+extern u32 apic_broadcast_id;
+extern unsigned long logical_cpu_present_map;
+extern unsigned long phys_cpu_present_map;
+
+/*
+ * Some lowlevel functions might want to know about
+ * the real APIC ID <-> CPU # mapping.
+ */
+#define MAX_APICID 256
+#define BAD_APICID 0xFFu
+extern volatile u8 cpu_2_physical_apicid[NR_CPUS];
+extern volatile u8 physical_apicid_2_cpu[MAX_APICID];
+extern volatile u8 cpu_2_logical_apicid[NR_CPUS];
+extern volatile u8 logical_apicid_2_cpu[MAX_APICID];
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+
+#ifndef CONFIG_X86_LOCAL_APIC
+
+#define clustered_apic_mode	(0)
+#define esr_disable		(0)
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* !__ASSEMBLY__ */
+
+#define CLUSTERED_APIC_NUMAQ	0x01
+#define CLUSTERED_APIC_XAPIC	0x02
+
+#define clustered_apic_numaq	(clustered_apic_mode & CLUSTERED_APIC_NUMAQ)
+#define clustered_apic_xapic	(clustered_apic_mode & CLUSTERED_APIC_XAPIC)
+
+#define APIC_DEST_CPUS_MASK	0x0Fu	/* Destination masks for */
+#define APIC_DEST_CLUSTER_MASK	0xF0u	/* clustered mode. */
+#define INT_DEST_ADDR_MODE	1	/* logical delivery */
 
-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
-  #define clustered_apic_mode (1)
-  #define esr_disable (1)
- #else /* !CONFIG_MULTIQUAD */
-  #define clustered_apic_mode (0)
-  #define esr_disable (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif 
 
 #ifdef CONFIG_SMP
+#define smp_processor_id() (current->processor)
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -53,7 +76,6 @@
  */
  
 extern void smp_alloc_memory(void);
-extern unsigned long phys_cpu_present_map;
 extern unsigned long cpu_online_map;
 extern volatile unsigned long smp_invalidate_needed;
 extern int pic_mode;
@@ -69,16 +91,6 @@
 extern void zap_low_mappings (void);
 
 /*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
- */
-#define MAX_APICID 256
-extern volatile int cpu_to_physical_apicid[NR_CPUS];
-extern volatile int physical_apicid_to_cpu[MAX_APICID];
-extern volatile int cpu_to_logical_apicid[NR_CPUS];
-extern volatile int logical_apicid_to_cpu[MAX_APICID];
-
-/*
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
@@ -123,7 +135,7 @@
 
 #endif /* !__ASSEMBLY__ */
 
-#define NO_PROC_ID		0xFF		/* No processor magic marker */
+#define NO_PROC_ID		0xFFu		/* No processor magic marker */
 
-#endif
-#endif
+#endif /* CONFIG_SMP */
+#endif /* __ASM_SMP_H */
diff -ruN 2.5.31/include/asm-i386/smpboot.h s31/include/asm-i386/smpboot.h
--- 2.5.31/include/asm-i386/smpboot.h	Sat Aug 10 18:41:55 2002
+++ s31/include/asm-i386/smpboot.h	Wed Aug 14 19:30:13 2002
@@ -1,62 +1,50 @@
 #ifndef __ASM_SMPBOOT_H
 #define __ASM_SMPBOOT_H
 
-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
-  #define clustered_apic_mode (1)
- #else /* !CONFIG_MULTIQUAD */
-  #define clustered_apic_mode (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif 
- 
-#ifdef CONFIG_MULTIQUAD
- #define TRAMPOLINE_LOW phys_to_virt(0x8)
- #define TRAMPOLINE_HIGH phys_to_virt(0xa)
-#else /* !CONFIG_MULTIQUAD */
- #define TRAMPOLINE_LOW phys_to_virt(0x467)
- #define TRAMPOLINE_HIGH phys_to_virt(0x469)
-#endif /* CONFIG_MULTIQUAD */
-
-#ifdef CONFIG_MULTIQUAD
- #define boot_cpu_apicid boot_cpu_logical_apicid
-#else /* !CONFIG_MULTIQUAD */
- #define boot_cpu_apicid boot_cpu_physical_apicid
-#endif /* CONFIG_MULTIQUAD */
+#ifndef __ASM_SMP_H
+#include "asm/smp.h"
+#endif
+
+#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
+#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)
+
+//#define boot_cpu_apicid 
(clustered_apic_numaq?boot_cpu_logical_apicid:boot_cpu_physical_apicid)
+
+/*
+ * To build the logical APIC ID for each CPU we have three cases:
+ *  1) Normal flat mode:  use a bitmap of the CPU numbers
+ *  2) NUMA-Q:  do nothing, the BIOS has set it up
+ *  3) xAPIC:  convert the Intel standard physical APIC ID to a cluster
+ *	nibble/cpu bitmap nibble
+ */
+/* cpu index numbr:  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, ... */
+/* phys xAPIC IDs : 00, 01, 02, 03, 10, 11, 12, 13, 20, 21, 22, ... */
+/* logical APIC ID: 01, 02, 04, 08, 11, 12, 14, 18, 21, 22, 24, ... */
+#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) | 
((phys_apic) & APIC_DEST_CLUSTER_MASK))
 
 /*
- * How to map from the cpu_present_map
+ * How to map from phys_cpu_present_map.
+ *  1) Normal flat mode:  use the mps_cpu, apicid bitmap
+ *  2) Multi-Quad:  only 4 CPUs per cluster, cluster ID in high nibble
  */
-#ifdef CONFIG_MULTIQUAD
- #define cpu_present_to_apicid(mps_cpu) ( ((mps_cpu/4)*16) + (1<<(mps_cpu%4)) 
)
-#else /* !CONFIG_MULTIQUAD */
- #define cpu_present_to_apicid(apicid) (apicid)
-#endif /* CONFIG_MULTIQUAD */
+#if 1
+#define cpu_present_to_apicid(cpu)	(cpu_to_logical_apicid(cpu))
+#else
+#define cpu_present_to_apicid(mps_cpu)	(clustered_apic_numaq ? \
+ 	( (((u32)(mps_cpu) >> 2) << 4) + (1u << ((mps_cpu) & 0x3)) ) : \
+	 (clustered_apic_xapic ? cpu_to_logical_apicid(mps_cpu) : 1u << (mps_cpu) ) 
)
+#endif
+extern unsigned char raw_phys_apicid[NR_CPUS];
+#define apicid_to_phys_cpu_present(apicid)	(clustered_apic_mode ? (1ul << 
((((apicid) >> 4) << 2) | ((apicid) & 0x3))) : (1ul << (apicid)))
 
 /*
  * Mappings between logical cpu number and logical / physical apicid
- * The first four macros are trivial, but it keeps the abstraction consistent
  */
-extern volatile int logical_apicid_2_cpu[];
-extern volatile int cpu_2_logical_apicid[];
-extern volatile int physical_apicid_2_cpu[];
-extern volatile int cpu_2_physical_apicid[];
-
-#define logical_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_logical_apicid(cpu) cpu_2_logical_apicid[cpu]
-#define physical_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_physical_apicid(cpu) cpu_2_physical_apicid[cpu]
-#ifdef CONFIG_MULTIQUAD			/* use logical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_logical_apicid[cpu]
-#else /* !CONFIG_MULTIQUAD */		/* use physical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_physical_apicid[cpu]
-#endif /* CONFIG_MULTIQUAD */
-
-
-#ifdef CONFIG_MULTIQUAD
-#else /* !CONFIG_MULTIQUAD */
-#endif /* CONFIG_MULTIQUAD */
+extern volatile u8 cpu_2_logical_apicid[];
+extern volatile u8 cpu_2_physical_apicid[];
+
+#define cpu_to_logical_apicid(cpu)	(int)cpu_2_logical_apicid[cpu]
+#define cpu_to_physical_apicid(cpu)	(int)cpu_2_physical_apicid[cpu]
 
 
 #endif


-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-13 19:22               ` Linus Torvalds
  2002-08-13 20:04                 ` Martin J. Bligh
  2002-08-13 20:22                 ` Alan Cox
@ 2002-08-24 12:19                 ` Zwane Mwaikambo
  2002-08-27  1:23                   ` James Cleverdon
  2 siblings, 1 reply; 34+ messages in thread
From: Zwane Mwaikambo @ 2002-08-24 12:19 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On Tue, 13 Aug 2002, Linus Torvalds wrote:

> Well, it makes performance _so_ much better on a P4 that it's not even 
> funny. It's basically a "P4 is unusable with SMP" without it.

Out of interest, does anyone have an idea of what Windows does?

	Zwane

-- 
function.linuxpower.ca



^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-24 12:19                 ` Zwane Mwaikambo
@ 2002-08-27  1:23                   ` James Cleverdon
  2002-08-27  7:46                     ` Zwane Mwaikambo
  0 siblings, 1 reply; 34+ messages in thread
From: James Cleverdon @ 2002-08-27  1:23 UTC (permalink / raw)
  To: Zwane Mwaikambo, Linus Torvalds
  Cc: Martin J. Bligh, linux-kernel, Matt Dobson

On Saturday 24 August 2002 05:19 am, Zwane Mwaikambo wrote:
> On Tue, 13 Aug 2002, Linus Torvalds wrote:
> > Well, it makes performance _so_ much better on a P4 that it's not even
> > funny. It's basically a "P4 is unusable with SMP" without it.
>
> Out of interest, does anyone have an idea of what Windows does?
>
> 	Zwane

I'm not allowed to report second hand rumors from the folks who assisted with 
the debug of the x440 HAL, so I won't mention that they may or may not be 
using clustered logical interrupts and adjusting priority with the TPR.

I can admit that I have no clue how they assign IRQs to APIC clusters, whether 
randomly or if they try some load balancing scheme.

-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH] NUMA-Q disable irqbalance
  2002-08-27  1:23                   ` James Cleverdon
@ 2002-08-27  7:46                     ` Zwane Mwaikambo
  0 siblings, 0 replies; 34+ messages in thread
From: Zwane Mwaikambo @ 2002-08-27  7:46 UTC (permalink / raw)
  To: James Cleverdon
  Cc: Linus Torvalds, Martin J. Bligh, linux-kernel, Matt Dobson

On Mon, 26 Aug 2002, James Cleverdon wrote:

> I'm not allowed to report second hand rumors from the folks who assisted with 
> the debug of the x440 HAL, so I won't mention that they may or may not be 
> using clustered logical interrupts and adjusting priority with the TPR.

Thanks anyway =)

> I can admit that I have no clue how they assign IRQs to APIC clusters, whether 
> randomly or if they try some load balancing scheme.

	Zwane
-- 
function.linuxpower.ca


^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2002-08-27  7:25 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-08-05 23:51 [PATCH] NUMA-Q disable irqbalance Martin J. Bligh
2002-08-13 16:13 ` Martin J. Bligh
2002-08-13 16:41   ` Linus Torvalds
2002-08-13 16:57     ` Alan Cox
2002-08-13 17:24       ` Martin J. Bligh
2002-08-13 17:38         ` Alan Cox
2002-08-13 17:14     ` Martin J. Bligh
2002-08-13 17:24       ` Linus Torvalds
2002-08-13 18:02         ` Martin J. Bligh
2002-08-13 18:20           ` Linus Torvalds
2002-08-13 18:58             ` Martin J. Bligh
2002-08-13 19:22               ` Linus Torvalds
2002-08-13 20:04                 ` Martin J. Bligh
2002-08-13 20:22                   ` Linus Torvalds
2002-08-14  5:52                     ` Martin J. Bligh
2002-08-14 10:10                     ` Jos Hulzink
2002-08-14 11:12                       ` David Lang
2002-08-13 20:22                 ` Alan Cox
2002-08-13 20:35                   ` Linus Torvalds
2002-08-13 20:34                     ` Alan Cox
2002-08-13 20:42                     ` Martin J. Bligh
2002-08-13 21:24                       ` Linus Torvalds
2002-08-13 22:29                         ` Andrew Theurer
2002-08-13 23:30                           ` Andrea Arcangeli
2002-08-14 21:16                             ` James Cleverdon
2002-08-23  2:31                             ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing James Cleverdon
2002-08-20  0:49                         ` [PATCH] NUMA-Q disable irqbalance Dave Hansen
2002-08-13 22:08                     ` Rik van Riel
2002-08-13 22:14                       ` Rik van Riel
2002-08-14 14:49                         ` Linus Torvalds
2002-08-14 15:19                           ` Rik van Riel
2002-08-24 12:19                 ` Zwane Mwaikambo
2002-08-27  1:23                   ` James Cleverdon
2002-08-27  7:46                     ` Zwane Mwaikambo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox