public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-07-26 16:03 ` Andi Kleen
@ 2005-08-04  7:05   ` James Cleverdon
  2005-08-04  9:22     ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: James Cleverdon @ 2005-08-04  7:05 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, Protasevich, Natalie

[-- Attachment #1: Type: text/plain, Size: 1012 bytes --]

Due to some device driver issues, I built this iteration of the patch 
vs. 2.6.12.3.

(Sorry about the attachment, but KMail is still word wrapping inserted 
files.)

Background:

Here's a patch that builds on Natalie Protasevich's IRQ compression 
patch and tries to work for MPS boots as well as ACPI.  It is meant for 
a 4-node IBM x460 NUMA box, which was dying because it had interrupt 
pins with GSI numbers > NR_IRQS and thus overflowed irq_desc.

The problem is that this system has 280 GSIs (which are 1:1 mapped with 
I/O APIC RTEs) and an 8-node box would have 560.  This is much bigger 
than NR_IRQS (224 for both i386 and x86_64).  Also, there aren't enough 
vectors to go around.  There are about 190 usable vectors, not counting 
the reserved ones and the unused vectors at 0x20 to 0x2F.  So, my patch 
attempts to compress the GSI range and share vectors by sharing IRQs.


-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

[-- Attachment #2: vect_share_irq_2005-08-03c_2.6.12.3 --]
[-- Type: text/x-diff, Size: 5920 bytes --]

diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c n12.3/arch/i386/kernel/acpi/boot.c
--- 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000 -0700
+++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04 00:01:10.199710211 -0700
@@ -42,6 +42,7 @@
 static inline void  acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
 extern void __init clustered_apic_check(void);
 static inline int ioapic_setup_disabled(void) { return 0; }
+extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
 #else	/* X86 */
@@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
 #include <mach_mpparse.h>
 #endif	/* CONFIG_X86_LOCAL_APIC */
 
+static inline int gsi_irq_sharing(int gsi) { return gsi; }
+
+
 #endif	/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -453,7 +457,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned in
  		*irq = IO_APIC_VECTOR(gsi);
 	else
 #endif
-		*irq = gsi;
+		*irq = gsi_irq_sharing(gsi);
 	return 0;
 }
 
diff -pruN 2.6.12.3/arch/x86_64/Kconfig n12.3/arch/x86_64/Kconfig
--- 2.6.12.3/arch/x86_64/Kconfig	2005-07-15 14:18:57.000000000 -0700
+++ n12.3/arch/x86_64/Kconfig	2005-08-03 21:31:07.487451167 -0700
@@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
 	default y
 
 config NR_CPUS
-	int "Maximum number of CPUs (2-256)"
-	range 2 256
+	int "Maximum number of CPUs (2-255)"
+	range 2 255
 	depends on SMP
-	default "8"
+	default "16"
 	help
 	  This allows you to specify the maximum number of CPUs which this
-	  kernel will support. Current maximum is 256 CPUs due to
+	  kernel will support. Current maximum is 255 CPUs due to
 	  APIC addressing limits. Less depending on the hardware.
 
 	  This is purely to save memory - each supported CPU requires
diff -pruN 2.6.12.3/arch/x86_64/kernel/io_apic.c n12.3/arch/x86_64/kernel/io_apic.c
--- 2.6.12.3/arch/x86_64/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ n12.3/arch/x86_64/kernel/io_apic.c	2005-08-03 21:31:07.488451039 -0700
@@ -56,7 +56,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -182,6 +182,8 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
  * specific CPU-side IRQs.
@@ -581,6 +583,64 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, irq, vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi)) {
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+ retry_vector:
+	vector = assign_irq_vector(gsi);
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ.  However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector) {
+			if (!platform_legacy_irq(i))
+				break;			/* got one */
+			IO_APIC_VECTOR(gsi) = 0;
+			goto retry_vector;
+		}
+	if (i < NR_IRQS) {
+		irq = i;
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, irq);
+		return irq;
+	}
+
+	irq = next_irq++;
+	BUG_ON(irq >= NR_IRQS);
+	gsi_2_irq[gsi] = irq;
+	IO_APIC_VECTOR(irq) = vector;
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, irq);
+
+	return irq;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -610,6 +670,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -670,9 +731,8 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
@@ -1866,6 +1926,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pruN 2.6.12.3/arch/x86_64/kernel/mpparse.c n12.3/arch/x86_64/kernel/mpparse.c
--- 2.6.12.3/arch/x86_64/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ n12.3/arch/x86_64/kernel/mpparse.c	2005-08-03 21:31:07.489450912 -0700
@@ -214,7 +214,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pruN 2.6.12.3/include/asm-x86_64/mpspec.h n12.3/include/asm-x86_64/mpspec.h
--- 2.6.12.3/include/asm-x86_64/mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ n12.3/include/asm-x86_64/mpspec.h	2005-08-03 21:31:07.489450912 -0700
@@ -157,7 +157,8 @@ struct mpc_config_lintsrc
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-04  7:05   ` [RFC][2.6.12.3] " James Cleverdon
@ 2005-08-04  9:22     ` Andi Kleen
  2005-08-15  2:57       ` James Cleverdon
  0 siblings, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2005-08-04  9:22 UTC (permalink / raw)
  To: James Cleverdon; +Cc: Andi Kleen, linux-kernel, Protasevich, Natalie

On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:

> diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c n12.3/arch/i386/kernel/acpi/boot.c
> --- 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000 -0700
> +++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04 00:01:10.199710211 -0700
> @@ -42,6 +42,7 @@
>  static inline void  acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
>  extern void __init clustered_apic_check(void);
>  static inline int ioapic_setup_disabled(void) { return 0; }
> +extern int gsi_irq_sharing(int gsi);
>  #include <asm/proto.h>
>  
>  #else	/* X86 */
> @@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
>  #include <mach_mpparse.h>
>  #endif	/* CONFIG_X86_LOCAL_APIC */
>  
> +static inline int gsi_irq_sharing(int gsi) { return gsi; }

Why is this different for i386/x86-64? It shouldn't.

As an unrelated note, we really need to get rid of this whole ifdef block.

> +++ n12.3/arch/x86_64/Kconfig	2005-08-03 21:31:07.487451167 -0700
> @@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
>  	default y
>  
>  config NR_CPUS
> -	int "Maximum number of CPUs (2-256)"
> -	range 2 256
> +	int "Maximum number of CPUs (2-255)"
> +	range 2 255
>  	depends on SMP
> -	default "8"
> +	default "16"

Don't change the default please.

> +static int next_irq = 16;

Won't this need a lock for hotplug later?

> +
> + retry_vector:
> +	vector = assign_irq_vector(gsi);
> +
> +	/*
> +	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
> +	 * use of vector and if found, return that IRQ.  However, we never want
> +	 * to share legacy IRQs, which usually have a different trigger mode
> +	 * than PCI.
> +	 */

Can we perhaps force such sharing early temporarily even when the table
is not filled up?  This way we would get better test coverage of all
of  this.

That would be later disabled of course.

Rest looks ok to me.

-Andi

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-10 21:03 Protasevich, Natalie
  2005-08-10 23:55 ` James Cleverdon
  2005-08-11 17:52 ` Zwane Mwaikambo
  0 siblings, 2 replies; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-10 21:03 UTC (permalink / raw)
  To: jamesclv, Andi Kleen; +Cc: Brown, Len, zwane, linux-kernel

> Due to some device driver issues, I built this iteration of 
> the patch vs. 2.6.12.3.
> 
> (Sorry about the attachment, but KMail is still word wrapping inserted
> files.)
> 
> Background:
> 
> Here's a patch that builds on Natalie Protasevich's IRQ 
> compression patch and tries to work for MPS boots as well as 
> ACPI.  It is meant for a 4-node IBM x460 NUMA box, which was 
> dying because it had interrupt pins with GSI numbers > 
> NR_IRQS and thus overflowed irq_desc.
> 
> The problem is that this system has 280 GSIs (which are 1:1 
> mapped with I/O APIC RTEs) and an 8-node box would have 560.  
> This is much bigger than NR_IRQS (224 for both i386 and 
> x86_64).  Also, there aren't enough vectors to go around.  
> There are about 190 usable vectors, not counting the reserved 
> ones and the unused vectors at 0x20 to 0x2F.  So, my patch 
> attempts to compress the GSI range and share vectors by sharing IRQs.
> 
Hi James, 
I tested your patch today (sorry it took a while, was out of town), and
in general it worked just fine. It was a small system with 3 IO-APICs,
will hopefully try it on a large partition with 64 of them tonight.
One thing I noticed: I think the patch is going for shared vectors way
before exhausting available NR_IRQS, so I suggest a small modification
to it, in gsi_irq_sharing():
int gsi_irq_sharing(int gsi)
{
        int i, irq, vector;

        BUG_ON(gsi >= NR_IRQ_VECTORS);

        if (platform_legacy_irq(gsi)) {
                gsi_2_irq[gsi] = gsi;
                return gsi;
        }

        if (gsi_2_irq[gsi] != 0xFF)
                return (int)gsi_2_irq[gsi];

        vector = assign_irq_vector(gsi);
// this part here==========
        if (gsi < 16) {
                irq = gsi;
                gsi_2_irq[gsi] = irq;
        } else {
                irq = next_irq++;
                gsi_2_irq[gsi] = irq;
        }
//====================
        IO_APIC_VECTOR(irq) = vector;
        printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
                        gsi, vector, irq);

        return irq;
}

(I took out the vector sharing part for clarity, just to concentrate on
compression, and I didn't do any boundary checks). The (gsi<16) takes
care of the recent problem with my ACPI IRQ compression patch breaking
VIA chipset that doesn't tolerate PCI IRQ numbers above 15.

I think this way we save more IRQs and place them more densely.
Here is back-to-back comparison of IRQ distribution with the original
and modified patch:

Original:
           CPU0       CPU1       CPU2       CPU3
  0:      18758      20011      20008      28294    IO-APIC-edge  timer
  1:         97         18         79         16    IO-APIC-edge  i8042
  2:          0          0          0          0          XT-PIC
cascade
  8:          1          0          0          1    IO-APIC-edge  rtc
  9:          0          0          0          0    IO-APIC-edge  acpi
 12:          0        708          0        110    IO-APIC-edge  i8042
 15:          4          0          0         39    IO-APIC-edge  ide1
 16:          0          0          0          0   IO-APIC-level
uhci_hcd:usb1, uhci_hcd:usb4
 17:          0          0          0          3   IO-APIC-level
ohci1394
 18:        670       2253        836       1981   IO-APIC-level
libata, uhci_hcd:usb3
 19:          0          0          0          0   IO-APIC-level
uhci_hcd:usb2
 23:          0          0          0          0   IO-APIC-level
ehci_hcd:usb5
 48:        212          0          0          4   IO-APIC-level  eth0
<== gap on the 3rd io-apic
NMI:        117         71         73         51
LOC:      87020      86997      86975      86952
ERR:          3
MIS:          0

<7>IRQ to pin mappings:
<7>IRQ0 -> 0:2
<7>IRQ1 -> 0:1
<7>IRQ3 -> 0:3
<7>IRQ4 -> 0:4
<7>IRQ5 -> 0:5
<7>IRQ6 -> 0:6
<7>IRQ7 -> 0:7
<7>IRQ8 -> 0:8
<7>IRQ9 -> 0:9
<7>IRQ10 -> 0:10
<7>IRQ11 -> 0:11
<7>IRQ12 -> 0:12
<7>IRQ14 -> 0:14
<7>IRQ15 -> 0:15
<7>IRQ16 -> 0:16
<7>IRQ17 -> 0:17
<7>IRQ18 -> 0:18
<7>IRQ19 -> 0:19
<7>IRQ20 -> 0:20
<7>IRQ23 -> 0:23
<7>IRQ26 -> 1:2 <=======jump on the 2nd io-apic
<7>IRQ27 -> 1:3
<7>IRQ28 -> 1:4
<7>IRQ29 -> 1:5
<7>IRQ48 -> 2:0 <=======jump on the 3rd io-apic
<7>IRQ49 -> 2:1
<7>IRQ50 -> 2:2
<7>IRQ51 -> 2:3
<7>IRQ52 -> 2:4
<7>IRQ53 -> 2:5
<7>IRQ54 -> 2:6
<7>IRQ55 -> 2:7
<7>IRQ56 -> 2:8

Modified:
           CPU0       CPU1       CPU2       CPU3
  0:      15125      17509      17507      25592    IO-APIC-edge  timer
  1:        187         66        280        140    IO-APIC-edge  i8042
  2:          0          0          0          0          XT-PIC
cascade
  8:          1          0          0          1    IO-APIC-edge  rtc
  9:          0          0          0          0    IO-APIC-edge  acpi
 12:          0          0          0        110    IO-APIC-edge  i8042
 15:          4          0          0         39    IO-APIC-edge  ide1
 16:          0          0          0          0   IO-APIC-level
uhci_hcd:usb1, uhci_hcd:usb4
 17:          0          0          0          2   IO-APIC-level
ohci1394
 18:        753       2070        925       2035   IO-APIC-level
libata, uhci_hcd:usb3
 19:          0          0          0          0   IO-APIC-level
uhci_hcd:usb2
 21:          0          0          0          0   IO-APIC-level
ehci_hcd:usb5
 26:        164          0          0          4   IO-APIC-level  eth0
NMI:        117         72         73         52
LOC:      75682      75659      75638      75615
ERR:          3
MIS:          0

<7>IRQ to pin mappings:
<7>IRQ0 -> 0:2
<7>IRQ1 -> 0:1
<7>IRQ3 -> 0:3
<7>IRQ4 -> 0:4
<7>IRQ5 -> 0:5
<7>IRQ6 -> 0:6
<7>IRQ7 -> 0:7
<7>IRQ8 -> 0:8
<7>IRQ9 -> 0:9
<7>IRQ10 -> 0:10
<7>IRQ11 -> 0:11
<7>IRQ12 -> 0:12
<7>IRQ14 -> 0:14
<7>IRQ15 -> 0:15
<7>IRQ16 -> 0:16
<7>IRQ17 -> 0:17
<7>IRQ18 -> 0:18
<7>IRQ19 -> 0:19
<7>IRQ20 -> 0:20
<7>IRQ21 -> 0:23
<7>IRQ22 -> 1:2
<7>IRQ23 -> 1:3
<7>IRQ24 -> 1:4
<7>IRQ25 -> 1:5
<7>IRQ26 -> 2:0
<7>IRQ27 -> 2:1
<7>IRQ28 -> 2:2
<7>IRQ29 -> 2:3
<7>IRQ30 -> 2:4
<7>IRQ31 -> 2:5
<7>IRQ32 -> 2:6
<7>IRQ33 -> 2:7
<7>IRQ34 -> 2:8

Unfortunately, I cannot test the vector sharing part properly, since on
our systems we are just about to use up all 224 interrupts, but not
quite.
I have to mention that as far as I know Zwane is about to release his
vector sharing mechanism, he had it implemented and working for i386 (I
tested it on ES7000 successfully, by itself and combined with
compression patch too), and was planning implementing it for x86_64. I
am officially volunteering for testing it in its present state, for both
i386 and x86_64 (I can still do this on our systems by removing the IRQ
compression code :), hope this will help Zwane and Andi to release it as
soon as possible.

Regards,
--Natalie

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-10 21:03 Protasevich, Natalie
@ 2005-08-10 23:55 ` James Cleverdon
  2005-08-11 17:52 ` Zwane Mwaikambo
  1 sibling, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-10 23:55 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Brown, Len, zwane, linux-kernel

Comments below.

On Wednesday 10 August 2005 02:03 pm, Protasevich, Natalie wrote:
> > Due to some device driver issues, I built this iteration of
> > the patch vs. 2.6.12.3.
> >
> > (Sorry about the attachment, but KMail is still word wrapping
> > inserted files.)
> >
> > Background:
> >
> > Here's a patch that builds on Natalie Protasevich's IRQ
> > compression patch and tries to work for MPS boots as well as
> > ACPI.  It is meant for a 4-node IBM x460 NUMA box, which was
> > dying because it had interrupt pins with GSI numbers >
> > NR_IRQS and thus overflowed irq_desc.
> >
> > The problem is that this system has 280 GSIs (which are 1:1
> > mapped with I/O APIC RTEs) and an 8-node box would have 560.
> > This is much bigger than NR_IRQS (224 for both i386 and
> > x86_64).  Also, there aren't enough vectors to go around.
> > There are about 190 usable vectors, not counting the reserved
> > ones and the unused vectors at 0x20 to 0x2F.  So, my patch
> > attempts to compress the GSI range and share vectors by sharing
> > IRQs.
>
> Hi James,
> I tested your patch today (sorry it took a while, was out of town),
> and in general it worked just fine. It was a small system with 3
> IO-APICs, will hopefully try it on a large partition with 64 of them
> tonight. One thing I noticed: I think the patch is going for shared
> vectors way before exhausting available NR_IRQS, so I suggest a small
> modification to it, in gsi_irq_sharing():
> int gsi_irq_sharing(int gsi)
> {
>         int i, irq, vector;
>
>         BUG_ON(gsi >= NR_IRQ_VECTORS);
>
>         if (platform_legacy_irq(gsi)) {
>                 gsi_2_irq[gsi] = gsi;
>                 return gsi;
>         }
>
>         if (gsi_2_irq[gsi] != 0xFF)
>                 return (int)gsi_2_irq[gsi];
>
>         vector = assign_irq_vector(gsi);
> // this part here==========

I thought I had this case covered earlier, given that in both i386 and 
x86_64:

#define platform_legacy_irq(irq)      ((irq) < 16)

In the deleted vector sharing code, I also check platform_legacy_irq, to 
avoid inadvertently sharing vectors already assigned to legacy IRQs.

Am I missing your point here?

>         if (gsi < 16) {
>                 irq = gsi;
>                 gsi_2_irq[gsi] = irq;
>         } else {
>                 irq = next_irq++;
>                 gsi_2_irq[gsi] = irq;
>         }
> //====================
>         IO_APIC_VECTOR(irq) = vector;
>         printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ
> %d\n", gsi, vector, irq);
>
>         return irq;
> }
>
> (I took out the vector sharing part for clarity, just to concentrate
> on compression, and I didn't do any boundary checks). The (gsi<16)
> takes care of the recent problem with my ACPI IRQ compression patch
> breaking VIA chipset that doesn't tolerate PCI IRQ numbers above 15.
>
> I think this way we are saving more IRQs and place them denser.
> Here is back-to-back comparison of IRQ distribution with the original
> and modified patch:
>
> Original:
>            CPU0       CPU1       CPU2       CPU3
>   0:      18758      20011      20008      28294    IO-APIC-edge 
> timer 1:         97         18         79         16    IO-APIC-edge 
> i8042 2:          0          0          0          0          XT-PIC
> cascade
>   8:          1          0          0          1    IO-APIC-edge  rtc
>   9:          0          0          0          0    IO-APIC-edge 
> acpi 12:          0        708          0        110    IO-APIC-edge 
> i8042 15:          4          0          0         39    IO-APIC-edge
>  ide1 16:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb1, uhci_hcd:usb4
>  17:          0          0          0          3   IO-APIC-level
> ohci1394
>  18:        670       2253        836       1981   IO-APIC-level
> libata, uhci_hcd:usb3
>  19:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb2
>  23:          0          0          0          0   IO-APIC-level
> ehci_hcd:usb5
>  48:        212          0          0          4   IO-APIC-level 
> eth0 <== gap on the 3nd io-apic
> NMI:        117         71         73         51
> LOC:      87020      86997      86975      86952
> ERR:          3
> MIS:          0
>
> <7>IRQ to pin mappings:
> <7>IRQ0 -> 0:2
> <7>IRQ1 -> 0:1
> <7>IRQ3 -> 0:3
> <7>IRQ4 -> 0:4
> <7>IRQ5 -> 0:5
> <7>IRQ6 -> 0:6
> <7>IRQ7 -> 0:7
> <7>IRQ8 -> 0:8
> <7>IRQ9 -> 0:9
> <7>IRQ10 -> 0:10
> <7>IRQ11 -> 0:11
> <7>IRQ12 -> 0:12
> <7>IRQ14 -> 0:14
> <7>IRQ15 -> 0:15
> <7>IRQ16 -> 0:16
> <7>IRQ17 -> 0:17
> <7>IRQ18 -> 0:18
> <7>IRQ19 -> 0:19
> <7>IRQ20 -> 0:20
> <7>IRQ23 -> 0:23
> <7>IRQ26 -> 1:2 <=======jump on the 2nd io-apic
> <7>IRQ27 -> 1:3
> <7>IRQ28 -> 1:4
> <7>IRQ29 -> 1:5
> <7>IRQ48 -> 2:0 <=======jump on the 3rd io-apic
> <7>IRQ49 -> 2:1
> <7>IRQ50 -> 2:2
> <7>IRQ51 -> 2:3
> <7>IRQ52 -> 2:4
> <7>IRQ53 -> 2:5
> <7>IRQ54 -> 2:6
> <7>IRQ55 -> 2:7
> <7>IRQ56 -> 2:8
>
> Modified:
>            CPU0       CPU1       CPU2       CPU3
>   0:      15125      17509      17507      25592    IO-APIC-edge 
> timer 1:        187         66        280        140    IO-APIC-edge 
> i8042 2:          0          0          0          0          XT-PIC
> cascade
>   8:          1          0          0          1    IO-APIC-edge  rtc
>   9:          0          0          0          0    IO-APIC-edge 
> acpi 12:          0          0          0        110    IO-APIC-edge 
> i8042 15:          4          0          0         39    IO-APIC-edge
>  ide1 16:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb1, uhci_hcd:usb4
>  17:          0          0          0          2   IO-APIC-level
> ohci1394
>  18:        753       2070        925       2035   IO-APIC-level
> libata, uhci_hcd:usb3
>  19:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb2
>  21:          0          0          0          0   IO-APIC-level
> ehci_hcd:usb5
>  26:        164          0          0          4   IO-APIC-level 
> eth0 NMI:        117         72         73         52
> LOC:      75682      75659      75638      75615
> ERR:          3
> MIS:          0
>
> <7>IRQ to pin mappings:
> <7>IRQ0 -> 0:2
> <7>IRQ1 -> 0:1
> <7>IRQ3 -> 0:3
> <7>IRQ4 -> 0:4
> <7>IRQ5 -> 0:5
> <7>IRQ6 -> 0:6
> <7>IRQ7 -> 0:7
> <7>IRQ8 -> 0:8
> <7>IRQ9 -> 0:9
> <7>IRQ10 -> 0:10
> <7>IRQ11 -> 0:11
> <7>IRQ12 -> 0:12
> <7>IRQ14 -> 0:14
> <7>IRQ15 -> 0:15
> <7>IRQ16 -> 0:16
> <7>IRQ17 -> 0:17
> <7>IRQ18 -> 0:18
> <7>IRQ19 -> 0:19
> <7>IRQ20 -> 0:20
> <7>IRQ21 -> 0:23
> <7>IRQ22 -> 1:2
> <7>IRQ23 -> 1:3
> <7>IRQ24 -> 1:4
> <7>IRQ25 -> 1:5
> <7>IRQ26 -> 2:0
> <7>IRQ27 -> 2:1
> <7>IRQ28 -> 2:2
> <7>IRQ29 -> 2:3
> <7>IRQ30 -> 2:4
> <7>IRQ31 -> 2:5
> <7>IRQ32 -> 2:6
> <7>IRQ33 -> 2:7
> <7>IRQ34 -> 2:8
>
> Unfortunately, I cannot test the vector sharing part properly, since
> on our systems we are just about to use up all 224 interrupts, but
> not quiet.

Since there are some reserved vectors and we aren't using 0x20-0x2F at 
all, there are only around 190 vectors available.  So, you should be 
able to test the vector sharing code.

> I have to mention that as far as I know Zwane is about to release his
> vector sharing mechanism, he had it implemented and working for i386
> (I tested it on ES7000 successfully, by itself and combined with
> compression patch too), and was planning implementing it for x86_64.
> I am officially volunteering for testing it in its present state, for
> both i386 and x86_64 (I can still do this on our systems by removing
> the IRQ compression code :), hope this will help Zwane and Andi to
> release it as soon as possible.
>
> Regards,
> --Natalie

I can't explain the gaps in the numbers with the original version.  I'll 
give your variant a try.

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-11  0:21 Protasevich, Natalie
  2005-08-11  3:14 ` James Cleverdon
  0 siblings, 1 reply; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-11  0:21 UTC (permalink / raw)
  To: jamesclv; +Cc: Andi Kleen, Brown, Len, zwane, linux-kernel

> > int gsi_irq_sharing(int gsi)
> > {
> >         int i, irq, vector;
> >
> >         BUG_ON(gsi >= NR_IRQ_VECTORS);
> >
> >         if (platform_legacy_irq(gsi)) {
> >                 gsi_2_irq[gsi] = gsi;
> >                 return gsi;
> >         }
> >
> >         if (gsi_2_irq[gsi] != 0xFF)
> >                 return (int)gsi_2_irq[gsi];
> >
> >         vector = assign_irq_vector(gsi); // this part here==========
> 
> I thought I had this case covered earlier, given that in both i386 and
> x86_64:
> 
> #define platform_legacy_irq(irq)      ((irq) < 16)

Yes, you are absolutely correct, I don't need the (gsi<16) part, this
takes care of PCI IRQs that happened to be <16.

> In the deleted vector sharing code, I also check 
> platform_legacy_irq, to avoid inadvertently sharing vectors 
> already assigned to legacy IRQs.
> 
> Am I missing your point here?
> 
> >         if (gsi < 16) {
> >                 irq = gsi;
> >                 gsi_2_irq[gsi] = irq;
> >         } else {
> >                 irq = next_irq++;
> >                 gsi_2_irq[gsi] = irq;
> >         }
> > //====================
> 
> I can't explain the gaps in the numbers with the original 
> version.  I'll give your variant a try.

The only problem is here:

+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ
%d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;
 
That means for any IRQ < NR_IRQS you allow it to be identity mapped,
with all the gaps, and only for ones exceeding 224 you'll assign
consecutive next_irqs++, whereas you can do it for all PCI IRQs above
15. So, the alternative clause should probably come down to just:
                 irq = next_irq++;
                 gsi_2_irq[gsi] = irq; - which means just removing the
one above...
(although we better test that :)...I will definitely test vector sharing
when I manage to get on a max-configuration partition here.

Regards,
--Natalie
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-11  0:21 [RFC][2.6.12.3] IRQ compression/sharing patch Protasevich, Natalie
@ 2005-08-11  3:14 ` James Cleverdon
  0 siblings, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-11  3:14 UTC (permalink / raw)
  To: Protasevich, Natalie
  Cc: Andi Kleen, Brown, Len, Russ Weight, linux-kernel, zwane

On Wednesday 10 August 2005 05:21 pm, Protasevich, Natalie wrote:
> > > int gsi_irq_sharing(int gsi)
> > > {
> > >         int i, irq, vector;
> > >
> > >         BUG_ON(gsi >= NR_IRQ_VECTORS);
> > >
> > >         if (platform_legacy_irq(gsi)) {
> > >                 gsi_2_irq[gsi] = gsi;
> > >                 return gsi;
> > >         }
> > >
> > >         if (gsi_2_irq[gsi] != 0xFF)
> > >                 return (int)gsi_2_irq[gsi];
> > >
> > >         vector = assign_irq_vector(gsi); // this part
> > > here==========
> >
> > I thought I had this case covered earlier, given that in both i386
> > and x86_64:
> >
> > #define platform_legacy_irq(irq)      ((irq) < 16)
>
> Yes, you are absolutely correct, I don't need the (gsi<16) part, this
> takes care of PCI IRQs that happened to be <16.
>
> > In the deleted vector sharing code, I also check
> > platform_legacy_irq, to avoid inadvertently sharing vectors
> > already assigned to legacy IRQs.
> >
> > Am I missing your point here?
> >
> > >         if (gsi < 16) {
> > >                 irq = gsi;
> > >                 gsi_2_irq[gsi] = irq;
> > >         } else {
> > >                 irq = next_irq++;
> > >                 gsi_2_irq[gsi] = irq;
> > >         }
> > > //====================
> >
> > I can't explain the gaps in the numbers with the original
> > version.  I'll give your variant a try.
>
> The only problem is here:
>
> +	if (i < NR_IRQS) {
> +		gsi_2_irq[gsi] = i;
> +		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ
> %d\n",
> +				gsi, vector, i);
> +		return i;
> +	}
> +
> +	i = next_irq++;
>
> That means for any IRQ < NR_IRQS you allow it to be identity mapped,
> with all the gaps, and only for ones exceeding 224 you'll assign
> consecutive next_irqs++, whereas you can do it for all PCI IRQs above
> 15. So, the alternative clause should probably come down to just:
>                  irq = next_irq++;
>                  gsi_2_irq[gsi] = irq; - which means just removing
> the one above...
> (although we better test that :)...I will definitely test vector
> sharing when manage to get on max configuration partition here.
>
> Regards,
> --Natalie

Blast.  You're right, because i is the result of scanning irq_vector
(AKA IO_APIC_VECTOR()) for a previous use of the vector.  If i < 
NR_IRQS then it has found a previous use and would substitute the other 
IRQ number rather than allocate a new one.

Of course, one side effect of calling assign_irq_vector(gsi) is that the 
vector number is stored at position gsi in irq_vector.  So, for IRQs 0 
to NR_IRQS - 1, it will always find itself.

OK, how about not fighting that side effect and only reassigning IRQs
that are >= NR_IRQS?  Maybe something like this:

@@ -579,4 +589,57 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can easily reach 800.
+ *
+ * Compact the sparse GSI space into the available IRQs and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	/*
+	 * Don't share legacy IRQs.  Their trigger modes are usually edge
+	 * and PCI is level.  Mixed modes are trouble.  Only big boxes are
+	 * likely to overflow IRQs or to share vectors.
+	 */
+	if (likely(gsi < NR_IRQS && !sharing_vectors) || platform_legacy_irq(gsi)) {
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return gsi_2_irq[gsi];
+	/*
+	 * Ran out of vectors or IRQ >= NR_IRQS.  Sharing vectors
+	 * means sharing IRQs, so scan irq_vectors for previous use
+	 * of vector and return that IRQ.
+	 */
+	tries = NR_IRQS;
+  try_again:
+	vector = assign_irq_vector(gsi);
+
+	/* Find the first IRQ using vector. */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+
+	if (i >= NR_IRQS || platform_legacy_irq(i)) {
+		if (--tries >= 0) {
+			IO_APIC_VECTOR(gsi) = 0;
+			goto try_again;
+		}
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	printk("GSI %d assigned vector 0x%02X and IRQ %d\n", gsi, vector, i);
+	gsi_2_irq[gsi] = i;
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;


-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-11 13:15 Protasevich, Natalie
  2005-08-11 17:24 ` James Cleverdon
  0 siblings, 1 reply; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-11 13:15 UTC (permalink / raw)
  To: jamesclv; +Cc: Andi Kleen, Russ Weight, linux-kernel

> > The only problem is here:
> >
> > +	if (i < NR_IRQS) {
> > +		gsi_2_irq[gsi] = i;
> > +		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ
> > %d\n",
> > +				gsi, vector, i);
> > +		return i;
> > +	}
> > +
> > +	i = next_irq++;
> >
> > That means for any IRQ < NR_IRQS you allow it to be 
> identity mapped, 
> > with all the gaps, and only for ones exceeding 224 you'll assign 
> > consecutive next_irqs++, whereas you can do it for all PCI 
> IRQs above 
> > 15. So, the alternative clause should probably come down to just:
> >                  irq = next_irq++;
> >                  gsi_2_irq[gsi] = irq; - which means just 
> removing the 
> > one above...
> > (although we better test that :)...I will definitely test vector 
> > sharing when manage to get on max configuration partition here.
> >
> > Regards,
> > --Natalie
> 
> Blast.  You're right, because i is the results of scanning 
> irq_vector (AKA IO_APIC_VECTOR()) for a previous use of the 
> vector.  If i < NR_IRQS then it has found a previous use and 
> would substitute the other IRQ number rather than allocate a new one.
> 
> Of course, one side effect of calling assign_irq_vector(gsi) 
> is that the vector number is stored at position gsi in 
> irq_vector.  So, for IRQs 0 to NR_IRQS - 1, it will always 
> find itself.
> 
> OK, how about not fighting that side effect and only 
> reassigning IRQs that are >= NR_IRQS?  Maybe something like this:

I am wondering where sharing_vectors gets set, it was not in the patch
originally.

> @@ -579,4 +589,57 @@ static inline int irq_trigger(int idx)
>  	return MPBIOS_trigger(idx);
>  }
>  
> +/*
> + * gsi_irq_sharing -- Name overload!  "irq" can be either a 
> legacy IRQ
> + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
> + * from ACPI, which can easily reach 800.
> + *
> + * Compact the sparse GSI space into a available IRQs and reuse
> + * vectors if possible.
> + */
> +int gsi_irq_sharing(int gsi)
> +{
> +	int i, tries, vector;
> +
> +	BUG_ON(gsi >= NR_IRQ_VECTORS);
> +
> +	/*
> +	 * Don't share legacy IRQs.  Their trigger modes are 
> usually edge
> +	 * and PCI is level.  Mixed modes are trouble.  Only 
> big boxes are
> +	 * likely to overflow IRQs or to share vectors.
> +	 */
> +	if (likely(gsi < NR_IRQS && !sharing_vectors) || 
> platform_legacy_irq(gsi)) {
> +		gsi_2_irq[gsi] = gsi;
> +		return gsi;
> +	}
> +
> +	if (gsi_2_irq[gsi] != 0xFF)
> +		return gsi_2_irq[gsi];
> +	/*
> +	 * Ran out of vectors or IRQ >= NR_IRQS.  Sharing vectors
> +	 * means sharing IRQs, so scan irq_vectors for previous use
> +	 * of vector and return that IRQ.
> +	 */
> +	tries = NR_IRQS;
> +  try_again:
> +	vector = assign_irq_vector(gsi);
> +
> +	/* Find the first IRQ using vector. */
> +	for (i = 0; i < NR_IRQS; i++)
> +		if (IO_APIC_VECTOR(i) == vector)
> +			break;
> +
> +	if (i >= NR_IRQS || platform_legacy_irq(i)) {
> +		if (--tries >= 0) {
> +			IO_APIC_VECTOR(gsi) = 0;
> +			goto try_again;
> +		}
> +		panic("gsi_irq_sharing: didn't find an IRQ 
> using vector 0x%02X for GSI %d", vector, gsi);
> +	}
> +	printk("GSI %d assigned vector 0x%02X and IRQ %d\n", 
> gsi, vector, i);
> +	gsi_2_irq[gsi] = i;
> +	return i;
> +}
> +
>  static int pin_2_irq(int idx, int apic, int pin)  {
>  	int irq, i;
> 
> 
> --
> James Cleverdon
> IBM LTC (xSeries Linux Solutions)
> {jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-11 13:15 Protasevich, Natalie
@ 2005-08-11 17:24 ` James Cleverdon
  0 siblings, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-11 17:24 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Russ Weight, linux-kernel

On Thursday 11 August 2005 06:15 am, Protasevich, Natalie wrote:
> > > The only problem is here:
> > >
> > > +	if (i < NR_IRQS) {
> > > +		gsi_2_irq[gsi] = i;
> > > +		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ
> > > %d\n",
> > > +				gsi, vector, i);
> > > +		return i;
> > > +	}
> > > +
> > > +	i = next_irq++;
> > >
> > > That means for any IRQ < NR_IRQS you allow it to be 
> > identity mapped, 
> > > with all the gaps, and only for ones exceeding 224 you'll assign 
> > > consecutive next_irqs++, whereas you can do it for all PCI 
> > IRQs above 
> > > 15. So, the alternative clause should probably come down to just:
> > >                  irq = next_irq++;
> > >                  gsi_2_irq[gsi] = irq; - which means just 
> > removing the 
> > > one above...
> > > (although we better test that :)...I will definitely test vector 
> > > sharing when manage to get on max configuration partition here.
> > >
> > > Regards,
> > > --Natalie
> > 
> > Blast.  You're right, because i is the results of scanning 
> > irq_vector (AKA IO_APIC_VECTOR()) for a previous use of the 
> > vector.  If i < NR_IRQS then it has found a previous use and 
> > would substitute the other IRQ number rather than allocate a new one.
> > 
> > Of course, one side effect of calling assign_irq_vector(gsi) 
> > is that the vector number is stored at position gsi in 
> > irq_vector.  So, for IRQs 0 to NR_IRQS - 1, it will always 
> > find itself.
> > 
> > OK, how about not fighting that side effect and only 
> > reassigning IRQs that are >= NR_IRQS?  Maybe something like this:
> 
> I am wondering where sharing_vectors gets set, it was not in the patch
> originally.

Oh, that.  Sorry, I added it to indicate when the vectors are all used
up.  Here:

@@ -667,14 +736,14 @@ int __init assign_irq_vector(int irq)
 	if (current_vector == IA32_SYSCALL_VECTOR)
 		goto next;
 
-	if (current_vector > FIRST_SYSTEM_VECTOR) {
-		offset++;
+	if (current_vector >= FIRST_SYSTEM_VECTOR) {
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
+		if (offset == 0)
+			sharing_vectors = 1;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	if (current_vector == FIRST_SYSTEM_VECTOR)
-		panic("ran out of interrupt sources!");
-
 	IO_APIC_VECTOR(irq) = current_vector;
 	return current_vector;
 }


Since we run out of vectors before IRQs, we need to start checking for
vector reuse once assign_irq_vector has wrapped.  I'm not especially happy
about putting in a global flag variable, but the alternative was to
scan irq_vectors for duplicates on every IRQ.  This seemed wasteful
when most boxes will never use up either vectors or IRQs.

After sleeping on it, maybe the original code can be patched without
having to hack assign_irq_vector(), etc.  How about:

--- io_apic.c	2005-08-11 10:14:33.564748923 -0700
+++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
@@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
 	 * than PCI.
 	 */
 	for (i = 0; i < NR_IRQS; i++)
-		if (IO_APIC_VECTOR(i) == vector) {
+		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
 			if (!platform_legacy_irq(i))
 				break;			/* got one */
 			IO_APIC_VECTOR(gsi) = 0;



> > @@ -579,4 +589,57 @@ static inline int irq_trigger(int idx)
> >  	return MPBIOS_trigger(idx);
> >  }
> >  
> > +/*
> > + * gsi_irq_sharing -- Name overload!  "irq" can be either a 
> > legacy IRQ
> > + * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
> > + * from ACPI, which can easily reach 800.
> > + *
> > + * Compact the sparse GSI space into a available IRQs and reuse
> > + * vectors if possible.
> > + */
> > +int gsi_irq_sharing(int gsi)
> > +{
> > +	int i, tries, vector;
> > +
> > +	BUG_ON(gsi >= NR_IRQ_VECTORS);
> > +
> > +	/*
> > +	 * Don't share legacy IRQs.  Their trigger modes are 
> > usually edge
> > +	 * and PCI is level.  Mixed modes are trouble.  Only 
> > big boxes are
> > +	 * likely to overflow IRQs or to share vectors.
> > +	 */
> > +	if (likely(gsi < NR_IRQS && !sharing_vectors) || 
> > platform_legacy_irq(gsi)) {
> > +		gsi_2_irq[gsi] = gsi;
> > +		return gsi;
> > +	}
> > +
> > +	if (gsi_2_irq[gsi] != 0xFF)
> > +		return gsi_2_irq[gsi];
> > +	/*
> > +	 * Ran out of vectors or IRQ >= NR_IRQS.  Sharing vectors
> > +	 * means sharing IRQs, so scan irq_vectors for previous use
> > +	 * of vector and return that IRQ.
> > +	 */
> > +	tries = NR_IRQS;
> > +  try_again:
> > +	vector = assign_irq_vector(gsi);
> > +
> > +	/* Find the first IRQ using vector. */
> > +	for (i = 0; i < NR_IRQS; i++)
> > +		if (IO_APIC_VECTOR(i) == vector)
> > +			break;
> > +
> > +	if (i >= NR_IRQS || platform_legacy_irq(i)) {
> > +		if (--tries >= 0) {
> > +			IO_APIC_VECTOR(gsi) = 0;
> > +			goto try_again;
> > +		}
> > +		panic("gsi_irq_sharing: didn't find an IRQ 
> > using vector 0x%02X for GSI %d", vector, gsi);
> > +	}
> > +	printk("GSI %d assigned vector 0x%02X and IRQ %d\n", 
> > gsi, vector, i);
> > +	gsi_2_irq[gsi] = i;
> > +	return i;
> > +}
> > +
> >  static int pin_2_irq(int idx, int apic, int pin)  {
> >  	int irq, i;
> > 
> > 
> > --
> > James Cleverdon
> > IBM LTC (xSeries Linux Solutions)
> > {jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm
> > 
> 
> 
> 

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-10 21:03 Protasevich, Natalie
  2005-08-10 23:55 ` James Cleverdon
@ 2005-08-11 17:52 ` Zwane Mwaikambo
  1 sibling, 0 replies; 21+ messages in thread
From: Zwane Mwaikambo @ 2005-08-11 17:52 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: jamesclv, Andi Kleen, Brown, Len, linux-kernel

On Wed, 10 Aug 2005, Protasevich, Natalie wrote:

> our systems we are just about to use up all 224 interrupts, but not
> quite.
> I have to mention that as far as I know Zwane is about to release his
> vector sharing mechanism, he had it implemented and working for i386 (I
> tested it on ES7000 successfully, by itself and combined with
> compression patch too), and was planning implementing it for x86_64. I
> am officially volunteering for testing it in its present state, for both
> i386 and x86_64 (I can still do this on our systems by removing the IRQ
> compression code :), hope this will help Zwane and Andi to release it as
> soon as possible.

I added some of the suggestions brought forward (dynamically allocated 
IDTs, percpu IDT) last night, all that's left is MSI, which does work 
right now, but gets all its vectors allocated on the first irq handling 
domain. I should be done soon, time permitting.

Thanks,
	Zwane


^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-11 21:55 Protasevich, Natalie
  2005-08-12  1:07 ` James Cleverdon
  2005-08-12  2:59 ` James Cleverdon
  0 siblings, 2 replies; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-11 21:55 UTC (permalink / raw)
  To: jamesclv; +Cc: Andi Kleen, Russ Weight, linux-kernel

> After sleeping on it, maybe the original code can be patched 
> without having to hack assign_irq_vector(), etc.  How about:
> 
> --- io_apic.c	2005-08-11 10:14:33.564748923 -0700
> +++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
> @@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
>  	 * than PCI.
>  	 */
>  	for (i = 0; i < NR_IRQS; i++)
> -		if (IO_APIC_VECTOR(i) == vector) {
> +		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
>  			if (!platform_legacy_irq(i))
>  				break;			/* got one */
>  			IO_APIC_VECTOR(gsi) = 0;
> 
> 
Yes that did it, on my small system it looked just right:

<7>IRQ to pin mappings:
<7>IRQ0 -> 0:2
<7>IRQ1 -> 0:1
<7>IRQ3 -> 0:3
<7>IRQ4 -> 0:4
<7>IRQ5 -> 0:5
<7>IRQ6 -> 0:6
<7>IRQ7 -> 0:7
<7>IRQ8 -> 0:8
<7>IRQ9 -> 0:9
<7>IRQ10 -> 0:10
<7>IRQ11 -> 0:11
<7>IRQ12 -> 0:12
<7>IRQ14 -> 0:14
<7>IRQ15 -> 0:15
<7>IRQ16 -> 0:16
<7>IRQ17 -> 0:17
<7>IRQ18 -> 0:18
<7>IRQ19 -> 0:19
<7>IRQ20 -> 0:20
<7>IRQ21 -> 0:23
<7>IRQ22 -> 1:2
<7>IRQ23 -> 1:3
<7>IRQ24 -> 1:4
<7>IRQ25 -> 1:5
<7>IRQ26 -> 2:0
<7>IRQ27 -> 2:1
<7>IRQ28 -> 2:2
<7>IRQ29 -> 2:3
<7>IRQ30 -> 2:4
<7>IRQ31 -> 2:5
<7>IRQ32 -> 2:6
<7>IRQ33 -> 2:7
<7>IRQ34 -> 2:8
:!cat /proc/interrupts
           CPU0       CPU1       CPU2       CPU3
  0:      12621      15007      12781      20921    IO-APIC-edge  timer
  1:         72          0          2        175    IO-APIC-edge  i8042
  2:          0          0          0          0          XT-PIC  cascade
  8:          0          0          0          1    IO-APIC-edge  rtc
  9:          0          0          0          0    IO-APIC-edge  acpi
 12:          4        272          0        110    IO-APIC-edge  i8042
 15:          4          0          0         39    IO-APIC-edge  ide1
 16:          0          0          0          0   IO-APIC-level  uhci_hcd:usb1, uhci_hcd:usb4
 17:          0          0          0          2   IO-APIC-level  ohci1394
 18:        730       2407        932       2083   IO-APIC-level  libata, uhci_hcd:usb3
 19:          0          0          0          0   IO-APIC-level  uhci_hcd:usb2
 21:          0          0          0          0   IO-APIC-level  ehci_hcd:usb5
 26:        416          0          0          4   IO-APIC-level  eth0
NMI:        116         71         73         51
LOC:      61280      61258      61236      61214
ERR:          3
MIS:          0

Looks good! I will try the patch also on the ES7000 hopefully big enough
to exercise some vector sharing.

Regards,
--Natalie

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-11 22:02 Protasevich, Natalie
  2005-08-11 22:34 ` Zwane Mwaikambo
  0 siblings, 1 reply; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-11 22:02 UTC (permalink / raw)
  To: Zwane Mwaikambo; +Cc: Andi Kleen, Brown, Len, linux-kernel

> On Wed, 10 Aug 2005, Protasevich, Natalie wrote:
> 
> > our systems we are just about to use up all 224 interrupts, but not 
> > quite.
> > I have to mention that as far as I know Zwane is about to 
> release his 
> > vector sharing mechanism, he had it implemented and working 
> for i386 
> > (I tested it on ES7000 successfully, by itself and combined with 
> > compression patch too), and was planning implementing it 
> for x86_64. I 
> > am officially volunteering for testing it in its present state, for 
> > both
> > i386 and x86_64 (I can still do this on our systems by removing the 
> > IRQ compression code :), hope this will help Zwane and Andi 
> to release 
> > it as soon as possible.
> 
> I added some of the suggestions brought forward (dynamically 
> allocated IDTs, percpu IDT) last night, all that's left is 
> MSI, which does work right now, but gets all its vectors 
> allocated on the first irq handling domain. I should be done 
> soon, time permitting.

Zwane, please let me know when I can try it on ES7000, even work in
progress if you need it (see above about volunteering :)

Regards,
--Natalie 
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-11 22:02 Protasevich, Natalie
@ 2005-08-11 22:34 ` Zwane Mwaikambo
  0 siblings, 0 replies; 21+ messages in thread
From: Zwane Mwaikambo @ 2005-08-11 22:34 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Brown, Len, Linux Kernel

On Thu, 11 Aug 2005, Protasevich, Natalie wrote:

> > I added some of the suggestions brought forward (dynamically 
> > allocated IDTs, percpu IDT) last night, all that's left is 
> > MSI, which does work right now, but gets all its vectors 
> > allocated on the first irq handling domain. I should be done 
> > soon, time permitting.
> 
> Zwane, please let me know when I can try it on ES7000, even work in
> progress if you need it (see above about volunteering :)

Certainly, and thanks for volunteering; here is what I had booting last
night. There are some things which I need to resolve; for example,
allocations from __alloc_percpu don't seem to be cacheline aligned, let
alone 2k aligned (as I'd expect for a 2k allocation). IDTs are now per cpu, but
the policy for distributing which cpus service which devices is still on a
node basis. You may also need to ramp up NR_IRQS for the ES7000 subarch;
what is a good number?

Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/apic.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/apic.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 apic.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/apic.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/apic.c	11 Aug 2005 03:39:33 -0000
@@ -78,15 +78,15 @@ void __init apic_intr_init(void)
 	smp_intr_init();
 #endif
 	/* self generated IPI for local APIC timer */
-	set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+	boot_set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
 	/* IPI vectors for APIC spurious and error interrupts */
-	set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+	boot_set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	boot_set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* thermal monitor LVT interrupt */
 #ifdef CONFIG_X86_MCE_P4THERMAL
-	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	boot_set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 }
 
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/entry.S
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/entry.S,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 entry.S
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/entry.S	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/entry.S	7 Aug 2005 01:27:16 -0000
@@ -416,27 +416,18 @@ syscall_badsys:
 	FIXUP_ESPFIX_STACK \
 28:	popl %eax;
 
-/*
- * Build the entry stubs and pointer table with
- * some assembler magic.
- */
-.data
-ENTRY(interrupt)
-.text
-
+/* Build the IRQ entry stubs */
 vector=0
-ENTRY(irq_entries_start)
+	.align IRQ_STUB_SIZE,0x90
+ENTRY(interrupt)
 .rept NR_IRQS
 	ALIGN
-1:	pushl $vector-256
+	pushl $vector-0x10000
 	jmp common_interrupt
-.data
-	.long 1b
-.text
+	.align IRQ_STUB_SIZE,0x90
 vector=vector+1
 .endr
 
-	ALIGN
 common_interrupt:
 	SAVE_ALL
 	movl %esp,%eax
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/head.S
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/head.S,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 head.S
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/head.S	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/head.S	11 Aug 2005 02:44:06 -0000
@@ -11,6 +11,7 @@
 #include <linux/config.h>
 #include <linux/threads.h>
 #include <linux/linkage.h>
+#include <linux/numa.h>
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -304,7 +305,7 @@ is386:	movl $2,%ecx		# set MP
 
 	call check_x87
 	lgdt cpu_gdt_descr
-	lidt idt_descr
+	lidt cpu_idt_descr		# we switch to per cpu IDTs later
 	ljmp $(__KERNEL_CS),$1f
 1:	movl $(__KERNEL_DS),%eax	# reload all the segment registers
 	movl %eax,%ss			# after changing gdt.
@@ -370,7 +371,7 @@ setup_idt:
 	movw %dx,%ax		/* selector = 0x0010 = cs */
 	movw $0x8E00,%dx	/* interrupt gate - dpl=0, present */
 
-	lea idt_table,%edi
+	lea boot_idt_table,%edi
 	mov $256,%ecx
 rp_sidt:
 	movl %eax,(%edi)
@@ -445,7 +446,7 @@ int_msg:
  */
 
 .globl boot_gdt_descr
-.globl idt_descr
+.globl cpu_idt_descr
 .globl cpu_gdt_descr
 
 	ALIGN
@@ -456,9 +457,10 @@ boot_gdt_descr:
 	.long boot_gdt_table - __PAGE_OFFSET
 
 	.word 0				# 32-bit align idt_desc.address
-idt_descr:
+cpu_idt_descr:
 	.word IDT_ENTRIES*8-1		# idt contains 256 entries
-	.long idt_table
+	.long boot_idt_table
+	.fill NR_CPUS-1,8,0
 
 # boot GDT descriptor (later on used by CPU#0):
 	.word 0				# 32 bit align gdt_desc.address
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/i8259.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/i8259.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 i8259.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/i8259.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/i8259.c	11 Aug 2005 03:56:23 -0000
@@ -412,12 +412,12 @@ void __init init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+	for (i = 0; i < (NR_DEVICE_VECTORS); i++) {
 		int vector = FIRST_EXTERNAL_VECTOR + i;
 		if (i >= NR_IRQS)
 			break;
 		if (vector != SYSCALL_VECTOR) 
-			set_intr_gate(vector, interrupt[i]);
+			boot_set_intr_gate(vector, interrupt[i]);
 	}
 
 	/* setup after call gates are initialised (usually add in
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/io_apic.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/io_apic.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 io_apic.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/io_apic.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/io_apic.c	11 Aug 2005 05:34:40 -0000
@@ -78,14 +78,16 @@ static struct irq_pin_list {
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_IRQ_NODES][NR_VECTORS] =
+	{ [0 ... NR_IRQ_NODES-1][0 ... NR_VECTORS - 1] = -1 };
 #ifdef CONFIG_PCI_MSI
-#define vector_to_irq(vector) 	\
-	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#define vector_to_irq(node, vector)    \
+	(platform_legacy_irq(vector) ? vector : vector_irq[node][vector])
 #else
-#define vector_to_irq(vector)	(vector)
+#define vector_to_irq(cpu, vector)	(vector)
 #endif
 
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -1120,31 +1122,43 @@ static inline int IO_APIC_irq_trigger(in
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 vector_allocated[NR_IRQ_NODES][FIRST_SYSTEM_VECTOR];
 
-int assign_irq_vector(int irq)
+int assign_irq_vector(int irq, int node)
 {
-	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+	static u8 current_vector[NR_IRQ_NODES] = {[0 ... NR_IRQ_NODES-1] =
+		FIRST_DEVICE_VECTOR};
+	static int offset[NR_IRQ_NODES];
+	int vector;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
-		return IO_APIC_VECTOR(irq);
+	vector = IO_APIC_VECTOR(irq);
+	if ((vector > 0) && (irq != AUTO_ASSIGN)) {
+		vector_allocated[node][vector] = 1;
+		return vector;
+	}
 next:
-	current_vector += 8;
-	if (current_vector == SYSCALL_VECTOR)
+	current_vector[node] += 8;
+	if (current_vector[node] == SYSCALL_VECTOR)
 		goto next;
-
-	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
-		current_vector = FIRST_DEVICE_VECTOR + offset;
+	
+	if (current_vector[node] >= FIRST_SYSTEM_VECTOR) {
+		offset[node] = (offset[node] + 1) & 7;
+		current_vector[node] = FIRST_DEVICE_VECTOR + offset[node];
 	}
 
-	vector_irq[current_vector] = irq;
+	if (current_vector[node] == FIRST_SYSTEM_VECTOR)
+		return -ENOSPC;
+
+	vector = current_vector[node];
+	vector_irq[node][vector] = irq;
+	if (vector_allocated[node][vector])
+		goto next;
+	
+	vector_allocated[node][vector] = 1;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vector;
 
-	return current_vector;
+	return vector;
 }
 
 static struct hw_interrupt_type ioapic_level_type;
@@ -1154,7 +1168,7 @@ static struct hw_interrupt_type ioapic_e
 #define IOAPIC_EDGE	0
 #define IOAPIC_LEVEL	1
 
-static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger)
+static inline void ioapic_register_intr(int node, int irq, int vector, unsigned long trigger)
 {
 	if (use_pci_vector() && !platform_legacy_irq(irq)) {
 		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
@@ -1162,25 +1176,25 @@ static inline void ioapic_register_intr(
 			irq_desc[vector].handler = &ioapic_level_type;
 		else
 			irq_desc[vector].handler = &ioapic_edge_type;
-		set_intr_gate(vector, interrupt[vector]);
+		node_set_intr_gate(node, vector, interrupt[vector]);
 	} else	{
 		if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
 				trigger == IOAPIC_LEVEL)
 			irq_desc[irq].handler = &ioapic_level_type;
 		else
 			irq_desc[irq].handler = &ioapic_edge_type;
-		set_intr_gate(vector, interrupt[irq]);
+		node_set_intr_gate(node, vector, interrupt[irq]);
 	}
 }
 
 static void __init setup_IO_APIC_irqs(void)
 {
 	struct IO_APIC_route_entry entry;
-	int apic, pin, idx, irq, first_notcon = 1, vector;
+	int apic, pin, idx, irq, first_notcon = 1, vector, bus, node;
 	unsigned long flags;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
+	
 	for (apic = 0; apic < nr_ioapics; apic++) {
 	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 
@@ -1192,8 +1206,6 @@ static void __init setup_IO_APIC_irqs(vo
 		entry.delivery_mode = INT_DELIVERY_MODE;
 		entry.dest_mode = INT_DEST_MODE;
 		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest = 
-					cpu_mask_to_apicid(TARGET_CPUS);
 
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
@@ -1212,12 +1224,22 @@ static void __init setup_IO_APIC_irqs(vo
 		entry.trigger = irq_trigger(idx);
 		entry.polarity = irq_polarity(idx);
 
+		bus = mp_irqs[idx].mpc_srcbus;
+		node = mp_bus_id_to_node[bus];
+		entry.dest.logical.logical_dest = cpu_mask_to_apicid(node_to_cpumask(node));
+
 		if (irq_trigger(idx)) {
 			entry.trigger = 1;
 			entry.mask = 1;
 		}
 
 		irq = pin_2_irq(idx, apic, pin);
+		if (irq >= NR_IRQS) {
+			apic_printk(APIC_VERBOSE, KERN_DEBUG
+				"IO-APIC: out of IRQS node%d/bus%d/ioapic%d/irq%d\n",
+					node, bus, apic, irq);
+			continue;
+		}
 		/*
 		 * skip adding the timer int on secondary nodes, which causes
 		 * a small but painful rift in the time-space continuum
@@ -1231,9 +1253,12 @@ static void __init setup_IO_APIC_irqs(vo
 			continue;
 
 		if (IO_APIC_IRQ(irq)) {
-			vector = assign_irq_vector(irq);
+			vector = assign_irq_vector(irq, node);
+			if (vector < 0)
+				continue;
+
 			entry.vector = vector;
-			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+			ioapic_register_intr(node, irq, vector, IOAPIC_AUTO);
 		
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
@@ -1928,14 +1953,14 @@ static void end_level_ioapic_irq (unsign
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	return startup_edge_ioapic_irq(irq);
 }
 
 static void ack_edge_ioapic_vector(unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	move_irq(vector);
 	ack_edge_ioapic_irq(irq);
@@ -1943,14 +1968,14 @@ static void ack_edge_ioapic_vector(unsig
 
 static unsigned int startup_level_ioapic_vector (unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	return startup_level_ioapic_irq (irq);
 }
 
 static void end_level_ioapic_vector (unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	move_irq(vector);
 	end_level_ioapic_irq(irq);
@@ -1958,14 +1983,14 @@ static void end_level_ioapic_vector (uns
 
 static void mask_IO_APIC_vector (unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	mask_IO_APIC_irq(irq);
 }
 
 static void unmask_IO_APIC_vector (unsigned int vector)
 {
-	int irq = vector_to_irq(vector);
+	int irq = vector_to_irq(cpu_to_node(smp_processor_id()), vector);
 
 	unmask_IO_APIC_irq(irq);
 }
@@ -1974,7 +1999,8 @@ static void unmask_IO_APIC_vector (unsig
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
-	int irq = vector_to_irq(vector);
+	int node = cpu_to_node(first_cpu(cpu_mask));
+	int irq = vector_to_irq(node, vector);
 
 	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
@@ -2035,7 +2061,7 @@ static inline void init_IO_APIC_traps(vo
 		int tmp = irq;
 		if (use_pci_vector()) {
 			if (!platform_legacy_irq(tmp))
-				if ((tmp = vector_to_irq(tmp)) == -1)
+				if ((tmp = vector_to_irq(0, tmp)) == -1) /* FIXME - zwane */
 					continue;
 		}
 		if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
@@ -2181,7 +2207,8 @@ static inline void check_timer(void)
 	 * get/set the timer IRQ vector:
 	 */
 	disable_8259A_irq(0);
-	vector = assign_irq_vector(0);
+	vector = assign_irq_vector(0, cpu_to_node(smp_processor_id()));
+	/* This gets reserved on all nodes as FIRST_DEVICE_VECTOR */
 	set_intr_gate(vector, interrupt[0]);
 
 	/*
@@ -2528,6 +2555,7 @@ int io_apic_set_pci_routing (int ioapic,
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
+	int node, bus;
 
 	if (!IO_APIC_IRQ(irq)) {
 		printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -2545,7 +2573,6 @@ int io_apic_set_pci_routing (int ioapic,
 
 	entry.delivery_mode = INT_DELIVERY_MODE;
 	entry.dest_mode = INT_DEST_MODE;
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
 	entry.trigger = edge_level;
 	entry.polarity = active_high_low;
 	entry.mask  = 1;
@@ -2555,15 +2582,19 @@ int io_apic_set_pci_routing (int ioapic,
 	 */
 	if (irq >= 16)
 		add_pin_to_irq(irq, ioapic, pin);
-
-	entry.vector = assign_irq_vector(irq);
+	bus = mp_irqs[pin].mpc_srcbus;
+	node = mp_bus_id_to_node[bus];
+	entry.dest.logical.logical_dest = cpu_mask_to_apicid(node_to_cpumask(node));
+	entry.vector = assign_irq_vector(irq, node);
+	if (entry.vector < 0)
+		return -ENOSPC;
 
 	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
 		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
 		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
-	ioapic_register_intr(irq, entry.vector, edge_level);
+	ioapic_register_intr(node, irq, entry.vector, edge_level);
 
 	if (!ioapic && (irq < 16))
 		disable_8259A_irq(irq);
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/irq.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/irq.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 irq.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/irq.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/irq.c	7 Aug 2005 01:00:26 -0000
@@ -53,8 +53,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
  */
 fastcall unsigned int do_IRQ(struct pt_regs *regs)
 {	
-	/* high bits used in ret_from_ code */
-	int irq = regs->orig_eax & 0xff;
+	int irq = regs->orig_eax & 0xffff;
 #ifdef CONFIG_4KSTACKS
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/smpboot.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/smpboot.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 smpboot.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/smpboot.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/smpboot.c	11 Aug 2005 04:18:41 -0000
@@ -53,6 +53,7 @@
 #include <asm/tlbflush.h>
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
+#include <asm/cpu.h>
 
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
@@ -87,6 +88,7 @@ EXPORT_SYMBOL(cpu_online_map);
 
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
 EXPORT_SYMBOL(cpu_callout_map);
 static cpumask_t smp_commenced_mask;
 
@@ -1079,6 +1081,9 @@ static void __init smp_boot_cpus(unsigne
 	int apicid, cpu, bit, kicked;
 	unsigned long bogosum = 0;
 
+	/* prepare per CPU IDTs */
+	setup_idts();
+
 	/*
 	 * Setup boot CPU information
 	 */
@@ -1383,17 +1387,17 @@ void __init smp_intr_init(void)
 	 * IRQ0 must be given a fixed assignment and initialized,
 	 * because it's used before the IO-APIC is set up.
 	 */
-	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+	boot_set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
 
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
 	 */
-	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+	boot_set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
 	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	boot_set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
 
 	/* IPI for generic function call */
-	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+	boot_set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 }
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/traps.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/traps.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 traps.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/traps.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/traps.c	11 Aug 2005 05:32:56 -0000
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/arch_hooks.h>
 #include <asm/kdebug.h>
+#include <asm/cpu.h>
 
 #include <linux/irq.h>
 #include <linux/module.h>
@@ -70,7 +71,8 @@ char ignore_fpu_irq = 0;
  * F0 0F bug workaround.. We have a special link segment
  * for this.
  */
-struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+struct desc_struct *cpu_idt_table;
+struct desc_struct __initdata boot_idt_table[IDT_ENTRIES];
 
 asmlinkage void divide_error(void);
 asmlinkage void debug(void);
@@ -1090,14 +1092,16 @@ asmlinkage void math_emulate(long arg)
 #ifdef CONFIG_X86_F00F_BUG
 void __init trap_init_f00f_bug(void)
 {
-	__set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+	int cpu = smp_processor_id();
+
+	__set_fixmap(FIX_F00F_IDT, __pa(boot_idt_table), PAGE_KERNEL_RO);
 
 	/*
 	 * Update the IDT descriptor and reload the IDT so that
 	 * it uses the read-only mapped virtual address.
 	 */
-	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-	load_idt(&idt_descr);
+	cpu_idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+	load_idt(&cpu_idt_descr[cpu]);
 }
 #endif
 
@@ -1116,14 +1120,38 @@ do { \
 
 
 /*
- * This needs to use 'idt_table' rather than 'idt', and
+ * This needs to use 'cpu_idt_table' rather than 'idt', and
  * thus use the _nonmapped_ version of the IDT, as the
  * Pentium F0 0F bugfix can have resulted in the mapped
  * IDT being write-protected.
  */
+void node_set_intr_gate(unsigned int node, unsigned int n, void *addr)
+{
+	cpumask_t mask;
+	int cpu;
+	struct desc_struct *idt_table;
+
+	mask = node_to_cpumask(node);
+	for_each_cpu_mask(cpu, mask) {
+		idt_table = per_cpu_ptr(cpu_idt_table, cpu);
+		_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	}
+}
+
 void set_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	int cpu;
+	struct desc_struct *idt_table;
+
+	for_each_cpu(cpu) {
+		idt_table = per_cpu_ptr(cpu_idt_table, cpu);
+		_set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+	}
+}
+
+void __init boot_set_intr_gate(unsigned int n, void *addr)
+{
+	_set_gate(&boot_idt_table[n],14,0,addr,__KERNEL_CS);
 }
 
 /*
@@ -1129,25 +1157,27 @@ void set_intr_gate(unsigned int n, void 
 /*
  * This routine sets up an interrupt gate at directory privilege level 3.
  */
+
 static inline void set_system_intr_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+	_set_gate(&boot_idt_table[n],14, 3, addr, __KERNEL_CS);
 }
 
 static void __init set_trap_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+	_set_gate(&boot_idt_table[n],15,0,addr,__KERNEL_CS);
 }
 
 static void __init set_system_gate(unsigned int n, void *addr)
 {
-	_set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+	_set_gate(&boot_idt_table[n],15,3,addr,__KERNEL_CS);
 }
 
 static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
 {
-	_set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+	_set_gate(&boot_idt_table[n],5,0,0,(gdt_entry<<3));
 }
+
 #ifdef CONFIG_KGDB
 void set_intr_usr_gate(unsigned int n, void *addr)
 {
@@ -1169,10 +1199,9 @@ void __init trap_init(void)
 #ifdef CONFIG_X86_LOCAL_APIC
 	init_apic_mappings();
 #endif
-
 	set_trap_gate(0,&divide_error);
-	set_intr_gate(1,&debug);
-	set_intr_gate(2,&nmi);
+	boot_set_intr_gate(1,&debug);
+	boot_set_intr_gate(2,&nmi);
 #ifndef CONFIG_KGDB
 	set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
 #else
@@ -1188,7 +1217,7 @@ void __init trap_init(void)
 	set_trap_gate(11,&segment_not_present);
 	set_trap_gate(12,&stack_segment);
 	set_trap_gate(13,&general_protection);
-	set_intr_gate(14,&page_fault);
+	boot_set_intr_gate(14,&page_fault);
 	set_trap_gate(15,&spurious_interrupt_bug);
 	set_trap_gate(16,&coprocessor_error);
 	set_trap_gate(17,&alignment_check);
Index: linux-2.6.13-rc4-mm1/arch/i386/kernel/cpu/common.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/kernel/cpu/common.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 common.c
--- linux-2.6.13-rc4-mm1/arch/i386/kernel/cpu/common.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/kernel/cpu/common.c	11 Aug 2005 05:19:16 -0000
@@ -562,10 +562,30 @@ void __init early_cpu_init(void)
 	disable_pse = 1;
 #endif
 }
+
+/*
+ * allocate space for all the IDTs in the system and copy over
+ * the boot IDT table to the runtime one. Individual cpu IDTs
+ * will be done at cpu_init
+ */
+void __init setup_idts(void)
+{
+	int cpu = smp_processor_id();
+
+	cpu_idt_table = __alloc_percpu(IDT_SIZE, IDT_SIZE);
+
+	memcpy(per_cpu_ptr(cpu_idt_table, cpu), boot_idt_table, IDT_SIZE);
+	cpu_idt_descr[cpu].size = IDT_SIZE - 1;
+	cpu_idt_descr[cpu].address = (unsigned long)per_cpu_ptr(cpu_idt_table, cpu);
+
+	/* switch cpu0's IDT */
+	load_idt(&cpu_idt_descr[cpu]);
+}
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
+ * We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
  */
 void __devinit cpu_init(void)
@@ -607,6 +627,16 @@ void __devinit cpu_init(void)
 	cpu_gdt_descr[cpu].address =
 	    (unsigned long)&per_cpu(cpu_gdt_table, cpu);
 
+	/* Skip the BSP, it'll still use the boot IDT until runtime IDTs are
+	 * allocated later on.
+	 */
+	if (cpu) {
+		memcpy(per_cpu_ptr(cpu_idt_table, cpu), boot_idt_table, IDT_SIZE);
+		cpu_idt_descr[cpu].size = IDT_SIZE - 1;
+		cpu_idt_descr[cpu].address =
+			(unsigned long)per_cpu_ptr(cpu_idt_table, cpu);
+	}
+
 	/*
 	 * Set up the per-thread TLS descriptor cache:
 	 */
@@ -614,7 +644,8 @@ void __devinit cpu_init(void)
 		GDT_ENTRY_TLS_ENTRIES * 8);
 
 	load_gdt(&cpu_gdt_descr[cpu]);
-	load_idt(&idt_descr);
+	printk("CPU%d IDT at %lx\n", cpu, cpu_idt_descr[cpu].address);
+	load_idt(&cpu_idt_descr[cpu]);
 
 	/*
 	 * Delete NT
Index: linux-2.6.13-rc4-mm1/arch/i386/mach-voyager/voyager_smp.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/mach-voyager/voyager_smp.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 voyager_smp.c
--- linux-2.6.13-rc4-mm1/arch/i386/mach-voyager/voyager_smp.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/mach-voyager/voyager_smp.c	11 Aug 2005 03:40:42 -0000
@@ -1387,9 +1387,9 @@ setup_profiling_timer(unsigned int multi
  *  boot sequence interferes with bug checking; enable them later
  *  on in smp_init */
 #define VIC_SET_GATE(cpi, vector) \
-	set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector))
+	boot_set_intr_gate((cpi) + VIC_DEFAULT_CPI_BASE, (vector))
 #define QIC_SET_GATE(cpi, vector) \
-	set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
+	boot_set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector))
 
 void __init
 smp_intr_init(void)
Index: linux-2.6.13-rc4-mm1/arch/i386/mm/fault.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/arch/i386/mm/fault.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 fault.c
--- linux-2.6.13-rc4-mm1/arch/i386/mm/fault.c	6 Aug 2005 18:46:41 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/arch/i386/mm/fault.c	7 Aug 2005 00:40:40 -0000
@@ -409,9 +409,9 @@ bad_area_nosemaphore:
 	 * Pentium F0 0F C7 C8 bug workaround.
 	 */
 	if (boot_cpu_data.f00f_bug) {
-		unsigned long nr;
-		
-		nr = (address - idt_descr.address) >> 3;
+		unsigned long nr, node;
+		node = cpu_to_node(smp_processor_id());
+		nr = (address - node_idt_descr[node].address) >> 3;
 
 		if (nr == 6) {
 			do_invalid_op(regs, 0);
Index: linux-2.6.13-rc4-mm1/drivers/pci/msi.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/drivers/pci/msi.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 msi.c
--- linux-2.6.13-rc4-mm1/drivers/pci/msi.c	6 Aug 2005 18:46:32 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/drivers/pci/msi.c	11 Aug 2005 06:10:57 -0000
@@ -34,7 +34,7 @@ static int nr_reserved_vectors = NR_HP_R
 static int nr_msix_devices;
 
 #ifndef CONFIG_X86_IO_APIC
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_IRQ_NODES][NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 #endif
 
@@ -270,7 +270,7 @@ static void msi_address_init(struct msg_
 }
 
 static int msi_free_vector(struct pci_dev* dev, int vector, int reassign);
-static int assign_msi_vector(void)
+static int assign_msi_vector(int node)
 {
 	static int new_vector_avail = 1;
 	int vector;
@@ -299,7 +299,7 @@ static int assign_msi_vector(void)
 		 * vector-to-IOxAPIC IRQ mapping.
 	 	 */
 		for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) {
-			if (vector_irq[vector] != 0)
+			if (vector_irq[node][vector] != 0)
 				continue;
 			free_vector = vector;
 			if (!msi_desc[vector])
@@ -311,7 +311,7 @@ static int assign_msi_vector(void)
 			spin_unlock_irqrestore(&msi_lock, flags);
 			return -EBUSY;
 		}
-		vector_irq[free_vector] = -1;
+		vector_irq[node][free_vector] = -1;
 		nr_released_vectors--;
 		spin_unlock_irqrestore(&msi_lock, flags);
 		if (msi_desc[free_vector] != NULL) {
@@ -330,7 +330,7 @@ static int assign_msi_vector(void)
 
 		return free_vector;
 	}
-	vector = assign_irq_vector(AUTO_ASSIGN);
+	vector = assign_irq_vector(AUTO_ASSIGN, 0); /* FIXME - Zwane */
 	last_alloc_vector = vector;
 	if (vector  == LAST_DEVICE_VECTOR)
 		new_vector_avail = 0;
@@ -341,10 +341,10 @@ static int assign_msi_vector(void)
 
 static int get_new_vector(void)
 {
-	int vector;
+	int vector, node = 0;
 
-	if ((vector = assign_msi_vector()) > 0)
-		set_intr_gate(vector, interrupt[vector]);
+	if ((vector = assign_msi_vector(node)) > 0)
+		set_intr_gate(vector, interrupt[vector]); /* FIXME - Zwane */
 
 	return vector;
 }
@@ -352,6 +352,7 @@ static int get_new_vector(void)
 static int msi_init(void)
 {
 	static int status = -ENOMEM;
+	int node = 0;
 
 	if (!status)
 		return status;
@@ -368,14 +369,14 @@ static int msi_init(void)
 		printk(KERN_WARNING "PCI: MSI cache init failed\n");
 		return status;
 	}
-	last_alloc_vector = assign_irq_vector(AUTO_ASSIGN);
+	last_alloc_vector = assign_irq_vector(AUTO_ASSIGN, 0); /* FIXME - Zwane */
 	if (last_alloc_vector < 0) {
 		pci_msi_enable = 0;
 		printk(KERN_WARNING "PCI: No interrupt vectors available for MSI\n");
 		status = -EBUSY;
 		return status;
 	}
-	vector_irq[last_alloc_vector] = 0;
+	vector_irq[node][last_alloc_vector] = 0;
 	nr_released_vectors++;
 
 	return status;
@@ -686,7 +687,7 @@ static int msix_capability_init(struct p
  **/
 int pci_enable_msi(struct pci_dev* dev)
 {
-	int pos, temp, status = -EINVAL;
+	int pos, temp, node, status = -EINVAL;
 	u16 control;
 
 	if (!pci_msi_enable || !dev)
@@ -704,14 +705,15 @@ int pci_enable_msi(struct pci_dev* dev)
 	if (control & PCI_MSI_FLAGS_ENABLE)
 		return 0;			/* Already in MSI mode */
 
+	node = pcibus_to_node(dev->bus->number);
 	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) {
 		/* Lookup Sucess */
 		unsigned long flags;
 
 		spin_lock_irqsave(&msi_lock, flags);
-		if (!vector_irq[dev->irq]) {
+		if (!vector_irq[node][dev->irq]) {
 			msi_desc[dev->irq]->msi_attrib.state = 0;
-			vector_irq[dev->irq] = -1;
+			vector_irq[node][dev->irq] = -1;
 			nr_released_vectors--;
 			spin_unlock_irqrestore(&msi_lock, flags);
 			enable_msi_mode(dev, pos, PCI_CAP_ID_MSI);
@@ -744,7 +746,7 @@ int pci_enable_msi(struct pci_dev* dev)
 void pci_disable_msi(struct pci_dev* dev)
 {
 	struct msi_desc *entry;
-	int pos, default_vector;
+	int pos, default_vector, node;
 	u16 control;
 	unsigned long flags;
 
@@ -761,6 +763,7 @@ void pci_disable_msi(struct pci_dev* dev
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return;
 	}
+	node = pcibus_to_node(dev->bus->number);
 	if (entry->msi_attrib.state) {
 		spin_unlock_irqrestore(&msi_lock, flags);
 		printk(KERN_WARNING "PCI: %s: pci_disable_msi() called without "
@@ -768,7 +771,7 @@ void pci_disable_msi(struct pci_dev* dev
 		       pci_name(dev), dev->irq);
 		BUG_ON(entry->msi_attrib.state > 0);
 	} else {
-		vector_irq[dev->irq] = 0; /* free it */
+		vector_irq[node][dev->irq] = 0; /* free it */
 		nr_released_vectors++;
 		default_vector = entry->msi_attrib.default_vector;
 		spin_unlock_irqrestore(&msi_lock, flags);
@@ -782,7 +785,7 @@ void pci_disable_msi(struct pci_dev* dev
 static int msi_free_vector(struct pci_dev* dev, int vector, int reassign)
 {
 	struct msi_desc *entry;
-	int head, entry_nr, type;
+	int head, entry_nr, type, node;
 	void __iomem *base;
 	unsigned long flags;
 
@@ -792,6 +795,7 @@ static int msi_free_vector(struct pci_de
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return -EINVAL;
 	}
+	node = pcibus_to_node(dev->bus->number);
 	type = entry->msi_attrib.type;
 	entry_nr = entry->msi_attrib.entry_nr;
 	head = entry->link.head;
@@ -800,7 +804,7 @@ static int msi_free_vector(struct pci_de
 	msi_desc[entry->link.tail]->link.head = entry->link.head;
 	entry->dev = NULL;
 	if (!reassign) {
-		vector_irq[vector] = 0;
+		vector_irq[node][vector] = 0;
 		nr_released_vectors++;
 	}
 	msi_desc[vector] = NULL;
@@ -844,7 +848,7 @@ static int msi_free_vector(struct pci_de
 static int reroute_msix_table(int head, struct msix_entry *entries, int *nvec)
 {
 	int vector = head, tail = 0;
-	int i, j = 0, nr_entries = 0;
+	int i, j = 0, nr_entries = 0, node = 0;
 	void __iomem *base;
 	unsigned long flags;
 
@@ -865,7 +869,7 @@ static int reroute_msix_table(int head, 
 	for (i = 0; i < *nvec; i++) {
 		j = msi_desc[vector]->msi_attrib.entry_nr;
 		msi_desc[vector]->msi_attrib.state = 0;	/* Mark it not active */
-		vector_irq[vector] = -1;		/* Mark it busy */
+		vector_irq[node][vector] = -1;		/* Mark it busy */
 		nr_released_vectors--;
 		entries[i].vector = vector;
 		if (j != (entries + i)->entry) {
@@ -996,7 +1000,7 @@ int pci_enable_msix(struct pci_dev* dev,
 
 void pci_disable_msix(struct pci_dev* dev)
 {
-	int pos, temp;
+	int pos, temp, node;
 	u16 control;
 
    	if (!dev || !(pos = pci_find_capability(dev, PCI_CAP_ID_MSIX)))
@@ -1007,6 +1011,7 @@ void pci_disable_msix(struct pci_dev* de
 		return;
 
 	temp = dev->irq;
+	node = pcibus_to_node(dev->bus->number);
 	if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) {
 		int state, vector, head, tail = 0, warning = 0;
 		unsigned long flags;
@@ -1018,7 +1023,7 @@ void pci_disable_msix(struct pci_dev* de
 			if (state)
 				warning = 1;
 			else {
-				vector_irq[vector] = 0; /* free it */
+				vector_irq[node][vector] = 0; /* free it */
 				nr_released_vectors++;
 			}
 			tail = msi_desc[vector]->link.tail;
Index: linux-2.6.13-rc4-mm1/drivers/pci/msi.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/drivers/pci/msi.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 msi.h
--- linux-2.6.13-rc4-mm1/drivers/pci/msi.h	6 Aug 2005 18:46:32 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/drivers/pci/msi.h	11 Aug 2005 06:02:30 -0000
@@ -18,8 +18,7 @@
  */
 #define NR_HP_RESERVED_VECTORS 	20
 
-extern int vector_irq[NR_VECTORS];
-extern void (*interrupt[NR_IRQS])(void);
+extern int vector_irq[NR_IRQ_NODES][NR_VECTORS];
 extern int pci_vector_resources(int last, int nr_released);
 
 #ifdef CONFIG_SMP
Index: linux-2.6.13-rc4-mm1/include/asm-i386/cpu.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/cpu.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 cpu.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/cpu.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/cpu.h	11 Aug 2005 03:51:22 -0000
@@ -17,5 +17,7 @@ extern int arch_register_cpu(int num);
 extern void arch_unregister_cpu(int);
 #endif
 
+extern void __devinit setup_idts(void);
+
 DECLARE_PER_CPU(int, cpu_state);
 #endif /* _ASM_I386_CPU_H_ */
Index: linux-2.6.13-rc4-mm1/include/asm-i386/desc.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/desc.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 desc.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/desc.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/desc.h	11 Aug 2005 03:42:25 -0000
@@ -2,6 +2,7 @@
 #define __ARCH_DESC_H
 
 #include <asm/ldt.h>
+#include <asm/numnodes.h>
 #include <asm/segment.h>
 
 #define CPU_16BIT_STACK_SIZE 1024
@@ -15,6 +16,9 @@
 #include <asm/mmu.h>
 
 extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
+extern struct desc_struct *cpu_idt_table;
+extern struct desc_struct boot_idt_table[IDT_ENTRIES];
+
 DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
 
 DECLARE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
@@ -25,7 +29,7 @@ struct Xgt_desc_struct {
 	unsigned short pad;
 } __attribute__ ((packed));
 
-extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
+extern struct Xgt_desc_struct cpu_idt_descr[NR_CPUS], cpu_gdt_descr[NR_CPUS];
 
 #define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
@@ -45,7 +49,9 @@ extern struct Xgt_desc_struct idt_descr,
  * something other than this.
  */
 extern struct desc_struct default_ldt[];
-extern void set_intr_gate(unsigned int irq, void * addr);
+extern void set_intr_gate(unsigned int vector, void * addr);
+extern void __init boot_set_intr_gate(unsigned int vector, void *addr);
+extern void node_set_intr_gate(unsigned int node, unsigned int vector, void * addr);
 
 #define _set_tssldt_desc(n,addr,limit,type) \
 __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
Index: linux-2.6.13-rc4-mm1/include/asm-i386/hw_irq.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/hw_irq.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 hw_irq.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/hw_irq.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/hw_irq.h	7 Aug 2005 00:40:40 -0000
@@ -29,7 +29,7 @@ extern u8 irq_vector[NR_IRQ_VECTORS];
 #define IO_APIC_VECTOR(irq)	(irq_vector[irq])
 #define AUTO_ASSIGN		-1
 
-extern void (*interrupt[NR_IRQS])(void);
+extern char interrupt[NR_IRQS][IRQ_STUB_SIZE];
 
 #ifdef CONFIG_SMP
 fastcall void reschedule_interrupt(void);
Index: linux-2.6.13-rc4-mm1/include/asm-i386/io_apic.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/io_apic.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 io_apic.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/io_apic.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/io_apic.h	7 Aug 2005 00:40:40 -0000
@@ -208,6 +208,6 @@ extern int (*ioapic_renumber_irq)(int io
 #define io_apic_assign_pci_irqs 0
 #endif
 
-extern int assign_irq_vector(int irq);
+extern int assign_irq_vector(int irq, int node);
 
 #endif
Index: linux-2.6.13-rc4-mm1/include/asm-i386/segment.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/segment.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 segment.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/segment.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/segment.h	7 Aug 2005 00:40:40 -0000
@@ -97,5 +97,5 @@
  * of tasks we can have..
  */
 #define IDT_ENTRIES 256
-
+#define IDT_SIZE (IDT_ENTRIES * 8)
 #endif
Index: linux-2.6.13-rc4-mm1/include/asm-i386/smp.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/smp.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 smp.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/smp.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/smp.h	11 Aug 2005 04:17:39 -0000
@@ -59,7 +59,7 @@ extern void cpu_uninit(void);
 
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_callin_map;
-#define cpu_possible_map cpu_callout_map
+extern cpumask_t cpu_possible_map;
 
 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
 static inline int num_booting_cpus(void)
Index: linux-2.6.13-rc4-mm1/include/asm-i386/mach-default/irq_vectors_limits.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/mach-default/irq_vectors_limits.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 irq_vectors_limits.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/mach-default/irq_vectors_limits.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/mach-default/irq_vectors_limits.h	11 Aug 2005 05:31:10 -0000
@@ -2,11 +2,15 @@
 #define _ASM_IRQ_VECTORS_LIMITS_H
 
 #ifdef CONFIG_PCI_MSI
-#define NR_IRQS FIRST_SYSTEM_VECTOR
+#define NR_IRQS 224
+#define IRQ_STUB_SIZE 16
 #define NR_IRQ_VECTORS NR_IRQS
+#define NR_IRQ_NODES	MAX_NUMNODES
 #else
 #ifdef CONFIG_X86_IO_APIC
 #define NR_IRQS 224
+#define IRQ_STUB_SIZE 16
+#define NR_IRQ_NODES	MAX_NUMNODES
 # if (224 >= 32 * NR_CPUS)
 # define NR_IRQ_VECTORS NR_IRQS
 # else
@@ -14,8 +18,13 @@
 # endif
 #else
 #define NR_IRQS 16
+#define IRQ_STUB_SIZE 16
+#define NR_IRQ_NODES	1
 #define NR_IRQ_VECTORS NR_IRQS
 #endif
 #endif
 
+/* number of vectors available for external interrupts in Linux */
+#define NR_DEVICE_VECTORS	190
+
 #endif /* _ASM_IRQ_VECTORS_LIMITS_H */
Index: linux-2.6.13-rc4-mm1/include/asm-i386/mach-visws/irq_vectors.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/mach-visws/irq_vectors.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 irq_vectors.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/mach-visws/irq_vectors.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/mach-visws/irq_vectors.h	7 Aug 2005 00:40:40 -0000
@@ -52,7 +52,10 @@
  */
 #define NR_VECTORS 256
 #define NR_IRQS 224
+#define IRQ_STUB_SIZE 16
 #define NR_IRQ_VECTORS NR_IRQS
+/* number of vectors available for external interrupts in Linux */
+#define NR_DEVICE_VECTORS	190
 
 #define FPU_IRQ			13
 
Index: linux-2.6.13-rc4-mm1/include/asm-i386/mach-voyager/irq_vectors.h
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/include/asm-i386/mach-voyager/irq_vectors.h,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 irq_vectors.h
--- linux-2.6.13-rc4-mm1/include/asm-i386/mach-voyager/irq_vectors.h	6 Aug 2005 18:46:47 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/include/asm-i386/mach-voyager/irq_vectors.h	7 Aug 2005 00:40:40 -0000
@@ -57,7 +57,10 @@
 
 #define NR_VECTORS 256
 #define NR_IRQS 224
+#define IRQ_STUB_SIZE 16
 #define NR_IRQ_VECTORS NR_IRQS
+/* number of vectors available for external interrupts in Linux */
+#define NR_DEVICE_VECTORS	190
 
 #define FPU_IRQ				13
 
Index: linux-2.6.13-rc4-mm1/init/main.c
===================================================================
RCS file: /home/cvsroot/linux-2.6.13-rc4-mm1/init/main.c,v
retrieving revision 1.1.1.1
diff -u -p -B -r1.1.1.1 main.c
--- linux-2.6.13-rc4-mm1/init/main.c	6 Aug 2005 18:46:53 -0000	1.1.1.1
+++ linux-2.6.13-rc4-mm1/init/main.c	11 Aug 2005 03:12:59 -0000
@@ -301,10 +301,13 @@ extern void setup_arch(char **);
 
 #ifndef CONFIG_SMP
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86
 static void __init smp_init(void)
 {
+	setup_idts();
+#ifdef CONFIG_X86_LOCAL_APIC
 	APIC_init_uniprocessor();
+#endif
 }
 #else
 #define smp_init()	do { } while (0)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-11 21:55 Protasevich, Natalie
@ 2005-08-12  1:07 ` James Cleverdon
  2005-08-12  2:59 ` James Cleverdon
  1 sibling, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-12  1:07 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Russ Weight, linux-kernel

On Thursday 11 August 2005 02:55 pm, Protasevich, Natalie wrote:
> > After sleeping on it, maybe the original code can be patched 
> > without having to hack assign_irq_vector(), etc.  How about:
> > 
> > --- io_apic.c	2005-08-11 10:14:33.564748923 -0700
> > +++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
> > @@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
> >  	 * than PCI.
> >  	 */
> >  	for (i = 0; i < NR_IRQS; i++)
> > -		if (IO_APIC_VECTOR(i) == vector) {
> > +		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
> >  			if (!platform_legacy_irq(i))
> >  				break;			/* got one */
> >  			IO_APIC_VECTOR(gsi) = 0;
> > 
> > 
> Yes that did it, on my small system it looked just right:
> 
> <7>IRQ to pin mappings:
> <7>IRQ0 -> 0:2
> <7>IRQ1 -> 0:1
> <7>IRQ3 -> 0:3
> <7>IRQ4 -> 0:4
> <7>IRQ5 -> 0:5
> <7>IRQ6 -> 0:6
> <7>IRQ7 -> 0:7
> <7>IRQ8 -> 0:8
> <7>IRQ9 -> 0:9
> <7>IRQ10 -> 0:10
> <7>IRQ11 -> 0:11
> <7>IRQ12 -> 0:12
> <7>IRQ14 -> 0:14
> <7>IRQ15 -> 0:15
> <7>IRQ16 -> 0:16
> <7>IRQ17 -> 0:17
> <7>IRQ18 -> 0:18
> <7>IRQ19 -> 0:19
> <7>IRQ20 -> 0:20
> <7>IRQ21 -> 0:23
> <7>IRQ22 -> 1:2
> <7>IRQ23 -> 1:3
> <7>IRQ24 -> 1:4
> <7>IRQ25 -> 1:5
> <7>IRQ26 -> 2:0
> <7>IRQ27 -> 2:1
> <7>IRQ28 -> 2:2
> <7>IRQ29 -> 2:3
> <7>IRQ30 -> 2:4
> <7>IRQ31 -> 2:5
> <7>IRQ32 -> 2:6
> <7>IRQ33 -> 2:7
> <7>IRQ34 -> 2:8
> :!cat /proc/interrupts
>            CPU0       CPU1       CPU2       CPU3
>   0:      12621      15007      12781      20921    IO-APIC-edge  timer
>   1:         72          0          2        175    IO-APIC-edge  i8042
>   2:          0          0          0          0          XT-PIC  cascade
>   8:          0          0          0          1    IO-APIC-edge  rtc
>   9:          0          0          0          0    IO-APIC-edge  acpi
>  12:          4        272          0        110    IO-APIC-edge  i8042
>  15:          4          0          0         39    IO-APIC-edge  ide1
>  16:          0          0          0          0   IO-APIC-level  uhci_hcd:usb1, uhci_hcd:usb4
>  17:          0          0          0          2   IO-APIC-level  ohci1394
>  18:        730       2407        932       2083   IO-APIC-level  libata, uhci_hcd:usb3
>  19:          0          0          0          0   IO-APIC-level  uhci_hcd:usb2
>  21:          0          0          0          0   IO-APIC-level  ehci_hcd:usb5
>  26:        416          0          0          4   IO-APIC-level  eth0
> NMI:        116         71         73         51
> LOC:      61280      61258      61236      61214
> ERR:          3
> MIS:          0
> 
> Looks good! I will try the patch also on the ES7000 hopefully big enough
> to exercise some vector sharing.
> 
> Regards,
> --Natalie


No, my quick fix still has some enumeration problems.  Suppose
gsi_irq_sharing has already handed out IRQ 16 and is then called with
GSI 16.  We'd call assign_irq_vector(16), which would clobber
irq_vector[16].  Not good.  We need to keep assign_irq_vector from
storing the vector in irq_vector until we have committed to a
particular IRQ number.

Here's a quick and ugly kludge:

--- 2.6.12.3/arch/x86_64/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/io_apic.c	2005-08-11 17:15:39.000000000 -0700
@@ -581,6 +586,69 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int next_irq = 16;
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+	u8 saved_vector;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi)) {
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	saved_vector = IO_APIC_VECTOR(gsi);	/* Kludge:  Need to make assign_irq_vector not always store vector in irq_vector */
+	vector = assign_irq_vector(gsi);
+	IO_APIC_VECTOR(gsi) = saved_vector;	/* Rest of Kludge */
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ.  However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {
+		if (--tries >= 0) {
+			IO_APIC_VECTOR(i) = 0;
+			goto try_again;
+		}
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(/*KERN_INFO*/ KERN_ERR "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;
+	printk(/*KERN_INFO*/ KERN_ERR "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;


I suppose the real solution might be an extra argument to
assign_irq_vector to tell it whether to save the vector or not.

What do you think?

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-11 21:55 Protasevich, Natalie
  2005-08-12  1:07 ` James Cleverdon
@ 2005-08-12  2:59 ` James Cleverdon
  1 sibling, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-12  2:59 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Russ Weight, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2963 bytes --]

The attached hack to assign_irq_vector may be marginally less ugly.
However, I haven't rearranged the code like Andi wanted yet.


On Thursday 11 August 2005 02:55 pm, Protasevich, Natalie wrote:
> > After sleeping on it, maybe the original code can be patched 
> > without having to hack assign_irq_vector(), etc.  How about:
> > 
> > --- io_apic.c	2005-08-11 10:14:33.564748923 -0700
> > +++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
> > @@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
> >  	 * than PCI.
> >  	 */
> >  	for (i = 0; i < NR_IRQS; i++)
> > -		if (IO_APIC_VECTOR(i) == vector) {
> > +		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
> >  			if (!platform_legacy_irq(i))
> >  				break;			/* got one */
> >  			IO_APIC_VECTOR(gsi) = 0;
> > 
> > 
> Yes that did it, on my small system it looked just right:
> 
> <7>IRQ to pin mappings:
> <7>IRQ0 -> 0:2
> <7>IRQ1 -> 0:1
> <7>IRQ3 -> 0:3
> <7>IRQ4 -> 0:4
> <7>IRQ5 -> 0:5
> <7>IRQ6 -> 0:6
> <7>IRQ7 -> 0:7
> <7>IRQ8 -> 0:8
> <7>IRQ9 -> 0:9
> <7>IRQ10 -> 0:10
> <7>IRQ11 -> 0:11
> <7>IRQ12 -> 0:12
> <7>IRQ14 -> 0:14
> <7>IRQ15 -> 0:15
> <7>IRQ16 -> 0:16
> <7>IRQ17 -> 0:17
> <7>IRQ18 -> 0:18
> <7>IRQ19 -> 0:19
> <7>IRQ20 -> 0:20
> <7>IRQ21 -> 0:23
> <7>IRQ22 -> 1:2
> <7>IRQ23 -> 1:3
> <7>IRQ24 -> 1:4
> <7>IRQ25 -> 1:5
> <7>IRQ26 -> 2:0
> <7>IRQ27 -> 2:1
> <7>IRQ28 -> 2:2
> <7>IRQ29 -> 2:3
> <7>IRQ30 -> 2:4
> <7>IRQ31 -> 2:5
> <7>IRQ32 -> 2:6
> <7>IRQ33 -> 2:7
> <7>IRQ34 -> 2:8
> :!cat /proc/interrupts
>            CPU0       CPU1       CPU2       CPU3
>   0:      12621      15007      12781      20921    IO-APIC-edge  timer
>   1:         72          0          2        175    IO-APIC-edge  i8042
>   2:          0          0          0          0          XT-PIC
> cascade
>   8:          0          0          0          1    IO-APIC-edge  rtc
>   9:          0          0          0          0    IO-APIC-edge  acpi
>  12:          4        272          0        110    IO-APIC-edge  i8042
>  15:          4          0          0         39    IO-APIC-edge  ide1
>  16:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb1, uhci_hcd:usb4
>  17:          0          0          0          2   IO-APIC-level
> ohci1394
>  18:        730       2407        932       2083   IO-APIC-level
> libata, uhci_hcd:usb3
>  19:          0          0          0          0   IO-APIC-level
> uhci_hcd:usb2
>  21:          0          0          0          0   IO-APIC-level
> ehci_hcd:usb5
>  26:        416          0          0          4   IO-APIC-level  eth0
> NMI:        116         71         73         51
> LOC:      61280      61258      61236      61214
> ERR:          3
> MIS:          0
> 
> Looks good! I will try the patch also on the ES7000 hopefully big enough
> to exercise some vector sharing.
> 
> Regards,
> --Natalie
> 
> 
> 

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

[-- Attachment #2: vect_share_irq_2005-08-11_2.6.12.3 --]
[-- Type: text/x-diff, Size: 6884 bytes --]

diff -pru 2.6.12.3/arch/i386/kernel/acpi/boot.c z12.3/arch/i386/kernel/acpi/boot.c
--- 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/acpi/boot.c	2005-08-11 19:27:46.000000000 -0700
@@ -42,6 +42,7 @@
 static inline void  acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
 extern void __init clustered_apic_check(void);
 static inline int ioapic_setup_disabled(void) { return 0; }
+extern int gsi_irq_sharing(int gsi);
 #include <asm/proto.h>
 
 #else	/* X86 */
@@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
 #include <mach_mpparse.h>
 #endif	/* CONFIG_X86_LOCAL_APIC */
 
+static inline int gsi_irq_sharing(int gsi) { return gsi; }
+
+
 #endif	/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -453,7 +457,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned in
  		*irq = IO_APIC_VECTOR(gsi);
 	else
 #endif
-		*irq = gsi;
+		*irq = gsi_irq_sharing(gsi);
 	return 0;
 }
 
diff -pru 2.6.12.3/arch/x86_64/kernel/io_apic.c z12.3/arch/x86_64/kernel/io_apic.c
--- 2.6.12.3/arch/x86_64/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/io_apic.c	2005-08-11 19:32:28.000000000 -0700
@@ -56,7 +56,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -88,6 +88,7 @@ static void add_pin_to_irq(unsigned int 
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -95,7 +96,7 @@ static void add_pin_to_irq(unsigned int 
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -581,6 +582,69 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int __assign_irq_vector(int irq);
+
+static int next_irq = 16;
+
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+
+	BUG_ON((unsigned)gsi >= NR_IRQ_VECTORS);	/* unsigned: a negative gsi must not index gsi_2_irq[] */
+
+	if (platform_legacy_irq(gsi)) {		/* a legacy GSI maps to the same-numbered IRQ */
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)		/* 0xFF is the initial "unmapped" fill value */
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = __assign_irq_vector(gsi);	/* returns a vector; does not write irq_vector[] */
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vector[] for a
+	 * previous user of this vector and, if found, return that IRQ.
+	 * However, we never want to share legacy IRQs, which usually have a
+	 * different trigger mode than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {		/* vector is held by a legacy IRQ: ask for another */
+		if (--tries >= 0)
+			goto try_again;
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {			/* vector already belongs to IRQ i: share it */
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	i = next_irq++;				/* vector unused so far: take the next IRQ number */
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;		/* only now record the vector <-> IRQ pairing */
+	vector_irq[vector] = i;
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -610,6 +674,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -619,6 +684,7 @@ static int pin_2_irq(int idx, int apic, 
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -634,6 +700,7 @@ static int pin_2_irq(int idx, int apic, 
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -657,12 +724,12 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -670,17 +737,24 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	vector_irq[current_vector] = irq;
+	return current_vector;
+}
+
+int assign_irq_vector(int irq)
+{
+	int vect;
+
+	vect = __assign_irq_vector(irq);
+	vector_irq[vect] = irq;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vect;
 
-	return current_vector;
+	return vect;
 }
 
 extern void (*interrupt[NR_IRQS])(void);
@@ -1866,6 +1940,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pru 2.6.12.3/arch/x86_64/kernel/mpparse.c z12.3/arch/x86_64/kernel/mpparse.c
--- 2.6.12.3/arch/x86_64/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/mpparse.c	2005-08-11 19:34:53.000000000 -0700
@@ -214,7 +214,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pru 2.6.12.3/include/asm-x86_64/mpspec.h z12.3/include/asm-x86_64/mpspec.h
--- 2.6.12.3/include/asm-x86_64/mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/mpspec.h	2005-08-10 17:08:45.000000000 -0700
@@ -157,7 +157,8 @@ struct mpc_config_lintsrc
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-04  9:22     ` Andi Kleen
@ 2005-08-15  2:57       ` James Cleverdon
  2005-08-15  5:55         ` Zwane Mwaikambo
  2005-08-15 17:44         ` Andi Kleen
  0 siblings, 2 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-15  2:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Protasevich, Natalie, Russ Weight, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3042 bytes --]

On Thursday 04 August 2005 02:22 am, Andi Kleen wrote:
> On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:
> > diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c
> > n12.3/arch/i386/kernel/acpi/boot.c ---
> > 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000
> > -0700 +++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04
> > 00:01:10.199710211 -0700 @@ -42,6 +42,7 @@
> >  static inline void  acpi_madt_oem_check(char *oem_id, char
> > *oem_table_id) { } extern void __init clustered_apic_check(void);
> >  static inline int ioapic_setup_disabled(void) { return 0; }
> > +extern int gsi_irq_sharing(int gsi);
> >  #include <asm/proto.h>
> >
> >  #else	/* X86 */
> > @@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
> >  #include <mach_mpparse.h>
> >  #endif	/* CONFIG_X86_LOCAL_APIC */
> >
> > +static inline int gsi_irq_sharing(int gsi) { return gsi; }
>
> Why is this different for i386/x86-64? It shouldn't.

True.  Have added code for i386.  Unfortunately, I didn't see one file 
that is shared by both architectures and which is included when 
building with I/O APIC support.  So, I duplicated the function into 
each architecture's io_apic.c.

> As an unrelated note we really need to get rid of this whole ifdef
> block.
>
> > +++ n12.3/arch/x86_64/Kconfig	2005-08-03 21:31:07.487451167 -0700
> > @@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
> >  	default y
> >
> >  config NR_CPUS
> > -	int "Maximum number of CPUs (2-256)"
> > -	range 2 256
> > +	int "Maximum number of CPUs (2-255)"
> > +	range 2 255
> >  	depends on SMP
> > -	default "8"
> > +	default "16"
>
> Don't change the default please.
>
> > +static int next_irq = 16;
>
> Won't this need a lock for hotplug later?

That's what I thought originally, but maybe not.  We initialize all RTEs 
and assign IRQs+vectors fairly early in boot, plus store the results in 
arrays.  Thereafter the functions just return the preallocated values.

Hmmm...  Since the I/O APIC init comes after the other CPUs are brought 
online, and since I don't understand all that the MSI driver is trying 
to accomplish, it might be safer to use a spin lock anyway.

> > +
> > + retry_vector:
> > +	vector = assign_irq_vector(gsi);
> > +
> > +	/*
> > +	 * Sharing vectors means sharing IRQs, so scan irq_vectors for
> > previous +	 * use of vector and if found, return that IRQ. 
> > However, we never want +	 * to share legacy IRQs, which usually
> > have a different trigger mode +	 * than PCI.
> > +	 */
>
> Can we perhaps force such sharing early temporarily even when the
> table is not filled up?  This way we would get better test coverage
> of all of  this.
>
> That would be later disabled of course.

Suppose I added a static counter and pretended that every third 
non-legacy IRQ needed to be shared?

> Rest looks ok to me.
>
> -Andi

Sigh.  Have to attach the file again.  Sorry about that.

Signed-off-by:  James Cleverdon <jamesclv@us.ibm.com>

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm


[-- Attachment #2: vect_share_irq_2005-08-14_2.6.12.3 --]
[-- Type: text/x-diff, Size: 12870 bytes --]

diff -pru 2.6.12.3/arch/i386/kernel/acpi/boot.c z12.3/arch/i386/kernel/acpi/boot.c
--- 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/acpi/boot.c	2005-08-14 15:40:36.000000000 -0700
@@ -453,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned in
  		*irq = IO_APIC_VECTOR(gsi);
 	else
 #endif
-		*irq = gsi;
+		*irq = gsi_irq_sharing(gsi);
 	return 0;
 }
 
diff -pru 2.6.12.3/arch/i386/kernel/io_apic.c z12.3/arch/i386/kernel/io_apic.c
--- 2.6.12.3/arch/i386/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/io_apic.c	2005-08-14 17:33:46.000000000 -0700
@@ -62,7 +62,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -1041,6 +1041,74 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int __assign_irq_vector(int irq);
+
+static int next_irq = 16;
+
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
+static DEFINE_SPINLOCK(gsi_irq_lock);
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+	unsigned long flags;
+
+	BUG_ON((unsigned)gsi >= NR_IRQ_VECTORS);	/* unsigned: a negative gsi must not index gsi_2_irq[] */
+
+	if (platform_legacy_irq(gsi)) {		/* a legacy GSI maps to the same-numbered IRQ */
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)		/* 0xFF is the initial "unmapped" fill value */
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = __assign_irq_vector(gsi);	/* returns a vector; does not write irq_vector[] */
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vector[] for a
+	 * previous user of this vector and, if found, return that IRQ.
+	 * However, we never want to share legacy IRQs, which usually have a
+	 * different trigger mode than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {		/* vector is held by a legacy IRQ: ask for another */
+		if (--tries >= 0)
+			goto try_again;
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {			/* vector already belongs to IRQ i: share it */
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	spin_lock_irqsave(&gsi_irq_lock, flags);	/* NOTE(review): lookup and scan above run unlocked -- confirm callers are serialized */
+	i = next_irq++;				/* vector unused so far: take the next IRQ number */
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;		/* only now record the vector <-> IRQ pairing */
+	vector_irq[vector] = i;
+	spin_unlock_irqrestore(&gsi_irq_lock, flags);
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -1071,6 +1139,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 
 			/*
 			 * For MPS mode, so far only needed by ES7000 platform
@@ -1127,11 +1196,11 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
 	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
@@ -1140,17 +1209,24 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	vector_irq[current_vector] = irq;
+	return current_vector;
+}
+
+int assign_irq_vector(int irq)
+{
+	int vect;
+
+	vect = __assign_irq_vector(irq);
+	vector_irq[vect] = irq;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vect;
 
-	return current_vector;
+	return vect;
 }
 
 static struct hw_interrupt_type ioapic_level_type;
@@ -2514,6 +2590,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask  = 1;
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pru 2.6.12.3/arch/i386/kernel/mpparse.c z12.3/arch/i386/kernel/mpparse.c
--- 2.6.12.3/arch/i386/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/mpparse.c	2005-08-14 15:33:52.000000000 -0700
@@ -274,7 +274,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pru 2.6.12.3/arch/x86_64/kernel/io_apic.c z12.3/arch/x86_64/kernel/io_apic.c
--- 2.6.12.3/arch/x86_64/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/io_apic.c	2005-08-14 16:45:13.000000000 -0700
@@ -56,7 +56,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -88,6 +88,7 @@ static void add_pin_to_irq(unsigned int 
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -95,7 +96,7 @@ static void add_pin_to_irq(unsigned int 
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -581,6 +582,74 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+static int __assign_irq_vector(int irq);
+
+static int next_irq = 16;
+
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
+static DEFINE_SPINLOCK(gsi_irq_lock);
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+	unsigned long flags;
+
+	BUG_ON((unsigned)gsi >= NR_IRQ_VECTORS);	/* unsigned: a negative gsi must not index gsi_2_irq[] */
+
+	if (platform_legacy_irq(gsi)) {		/* a legacy GSI maps to the same-numbered IRQ */
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)		/* 0xFF is the initial "unmapped" fill value */
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = __assign_irq_vector(gsi);	/* returns a vector; does not write irq_vector[] */
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vector[] for a
+	 * previous user of this vector and, if found, return that IRQ.
+	 * However, we never want to share legacy IRQs, which usually have a
+	 * different trigger mode than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {		/* vector is held by a legacy IRQ: ask for another */
+		if (--tries >= 0)
+			goto try_again;
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {			/* vector already belongs to IRQ i: share it */
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	spin_lock_irqsave(&gsi_irq_lock, flags);	/* NOTE(review): lookup and scan above run unlocked -- confirm callers are serialized */
+	i = next_irq++;				/* vector unused so far: take the next IRQ number */
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;		/* only now record the vector <-> IRQ pairing */
+	vector_irq[vector] = i;
+	spin_unlock_irqrestore(&gsi_irq_lock, flags);
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -610,6 +679,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -619,6 +689,7 @@ static int pin_2_irq(int idx, int apic, 
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -634,6 +705,7 @@ static int pin_2_irq(int idx, int apic, 
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -657,12 +729,12 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -670,17 +742,24 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	vector_irq[current_vector] = irq;
+	return current_vector;
+}
+
+int assign_irq_vector(int irq)
+{
+	int vect;
+
+	vect = __assign_irq_vector(irq);
+	vector_irq[vect] = irq;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vect;
 
-	return current_vector;
+	return vect;
 }
 
 extern void (*interrupt[NR_IRQS])(void);
@@ -1866,6 +1945,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pru 2.6.12.3/arch/x86_64/kernel/mpparse.c z12.3/arch/x86_64/kernel/mpparse.c
--- 2.6.12.3/arch/x86_64/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/mpparse.c	2005-08-11 19:34:53.000000000 -0700
@@ -214,7 +214,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pru 2.6.12.3/include/asm-i386/apic.h z12.3/include/asm-i386/apic.h
--- 2.6.12.3/include/asm-i386/apic.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-i386/apic.h	2005-08-14 15:34:49.000000000 -0700
@@ -108,6 +108,7 @@ extern void nmi_watchdog_tick (struct pt
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
+extern int gsi_irq_sharing(int gsi);
 
 extern void enable_NMI_through_LVT0 (void * dummy);
 
diff -pru 2.6.12.3/include/asm-i386/mach-generic/mach_mpspec.h z12.3/include/asm-i386/mach-generic/mach_mpspec.h
--- 2.6.12.3/include/asm-i386/mach-generic/mach_mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-i386/mach-generic/mach_mpspec.h	2005-08-14 15:39:10.000000000 -0700
@@ -1,7 +1,8 @@
 #ifndef __ASM_MACH_MPSPEC_H
 #define __ASM_MACH_MPSPEC_H
 
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 
 /* Summit or generic (i.e. installer) kernels need lots of bus entries. */
 /* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
diff -pru 2.6.12.3/include/asm-x86_64/apic.h z12.3/include/asm-x86_64/apic.h
--- 2.6.12.3/include/asm-x86_64/apic.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/apic.h	2005-08-14 15:36:51.000000000 -0700
@@ -98,6 +98,7 @@ extern int APIC_init_uniprocessor (void)
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 extern void clustered_apic_check(void);
+extern int gsi_irq_sharing(int gsi);
 
 extern void nmi_watchdog_default(void);
 extern int setup_nmi_watchdog(char *);
diff -pru 2.6.12.3/include/asm-x86_64/mpspec.h z12.3/include/asm-x86_64/mpspec.h
--- 2.6.12.3/include/asm-x86_64/mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/mpspec.h	2005-08-10 17:08:45.000000000 -0700
@@ -157,7 +157,8 @@ struct mpc_config_lintsrc
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [RFC][2.6.12.3] IRQ compression/sharing patch
@ 2005-08-15  4:35 Protasevich, Natalie
  2005-08-15 17:11 ` James Cleverdon
  0 siblings, 1 reply; 21+ messages in thread
From: Protasevich, Natalie @ 2005-08-15  4:35 UTC (permalink / raw)
  To: jamesclv, Andi Kleen; +Cc: Russ Weight, linux-kernel

> On Thursday 04 August 2005 02:22 am, Andi Kleen wrote:
> > On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:
> > > diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c
> > > n12.3/arch/i386/kernel/acpi/boot.c ---
> > > 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 
> 14:18:57.000000000
> > > -0700 +++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04
> > > 00:01:10.199710211 -0700 @@ -42,6 +42,7 @@  static inline void  
> > > acpi_madt_oem_check(char *oem_id, char
> > > *oem_table_id) { } extern void __init 
> clustered_apic_check(void);  
> > > static inline int ioapic_setup_disabled(void) { return 0; }
> > > +extern int gsi_irq_sharing(int gsi);
> > >  #include <asm/proto.h>
> > >
> > >  #else	/* X86 */
> > > @@ -51,6 +52,9 @@ static inline int 
> ioapic_setup_disabled(  #include 
> > > <mach_mpparse.h>
> > >  #endif	/* CONFIG_X86_LOCAL_APIC */
> > >
> > > +static inline int gsi_irq_sharing(int gsi) { return gsi; }
> >
> > Why is this different for i386/x86-64? It shouldn't.
> 
> True.  Have added code for i386.  Unfortunately, I didn't see 
> one file that is shared by both architectures and which is 
> included when building with I/O APIC support.  So, I 
> duplicated the function into io_apic.c
> 
> > As an unrelated note we really need to get rid of this whole ifdef 
> > block.
> >
> > > +++ n12.3/arch/x86_64/Kconfig	2005-08-03 
> 21:31:07.487451167 -0700
> > > @@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
> > >  	default y
> > >
> > >  config NR_CPUS
> > > -	int "Maximum number of CPUs (2-256)"
> > > -	range 2 256
> > > +	int "Maximum number of CPUs (2-255)"
> > > +	range 2 255
> > >  	depends on SMP
> > > -	default "8"
> > > +	default "16"
> >
> > Don't change the default please.
> >
> > > +static int next_irq = 16;
> >
> > Won't this need a lock for hotplug later?
> 
> That's what I thought originally, but maybe not.  We 
> initialize all RTEs and assign IRQs+vectors fairly early in 
> boot, plus store the results in arrays.  Thereafter the 
> functions just return the preallocated values.
> 
> Hmmm...  Since the I/O APIC init comes after the other CPUs 
> are brought online, and since I don't understand all that the 
> MSI driver is trying to accomplish, it might be safer to use 
> a spin lock anyway.
> 
> > > +
> > > + retry_vector:
> > > +	vector = assign_irq_vector(gsi);
> > > +
> > > +	/*
> > > +	 * Sharing vectors means sharing IRQs, so scan irq_vectors for
> > > previous +	 * use of vector and if found, return that IRQ. 
> > > However, we never want +	 * to share legacy IRQs, which usually
> > > have a different trigger mode +	 * than PCI.
> > > +	 */
> >
> > Can we perhaps force such sharing early temporarily even when the 
> > table is not filled up?  This way we would get better test 
> coverage of 
> > all of  this.
> >
> > That would be later disabled of course.
> 
> Suppose I added a static counter and pretended that every 
> third non-legacy IRQ needed to be shared?
> 
> > Rest looks ok to me.
> >
> > -Andi
> 
> Sigh.  Have to attach the file again.  Sorry about that.
> 
> Signed-off-by:  James Cleverdon <jamesclv@us.ibm.com>

I think you were going to change this line, which fixed the jumps in the
irq distribution:

--- io_apic.c	2005-08-11 10:14:33.564748923 -0700
+++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
@@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
  	 * than PCI.
  	 */
  	for (i = 0; i < NR_IRQS; i++)
 -		if (IO_APIC_VECTOR(i) == vector) {
 +		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
  			if (!platform_legacy_irq(i))
  				break;			/* got one */
  			IO_APIC_VECTOR(gsi) = 0;
But it's not in this version of the patch.
Thanks,
--Natalie 
> --
> James Cleverdon
> IBM LTC (xSeries Linux Solutions)
> {jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm
> 
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-15  2:57       ` James Cleverdon
@ 2005-08-15  5:55         ` Zwane Mwaikambo
  2005-08-15 17:44         ` Andi Kleen
  1 sibling, 0 replies; 21+ messages in thread
From: Zwane Mwaikambo @ 2005-08-15  5:55 UTC (permalink / raw)
  To: James Cleverdon
  Cc: Andi Kleen, Protasevich, Natalie, Russ Weight, linux-kernel

On Sun, 14 Aug 2005, James Cleverdon wrote:

> > > +static int next_irq = 16;
> >
> > Won't this need a lock for hotplug later?
> 
> That's what I thought originally, but maybe not.  We initialize all RTEs 
> and assign IRQs+vectors fairly early in boot, plus store the results in 
> arrays.  Thereafter the functions just return the preallocated values.
> 
> Hmmm...  Since the I/O APIC init comes after the other CPUs are brought 
> online, and since I don't understand all that the MSI driver is trying 
> to accomplish, it might be safer to use a spin lock anyway.

With respect to vector allocation, the MSI driver locks around 
assign_irq_vector.  It doesn't look like next_irq is used in that path, so 
you shouldn't need a lock if it's only used in single-threaded init.  This 
would of course change if IOAPICs were added after boot.


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-15  4:35 Protasevich, Natalie
@ 2005-08-15 17:11 ` James Cleverdon
  0 siblings, 0 replies; 21+ messages in thread
From: James Cleverdon @ 2005-08-15 17:11 UTC (permalink / raw)
  To: Protasevich, Natalie; +Cc: Andi Kleen, Russ Weight, linux-kernel

On Sunday 14 August 2005 09:35 pm, Protasevich, Natalie wrote:
> > On Thursday 04 August 2005 02:22 am, Andi Kleen wrote:
> > > On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:
[ Snip! ]
> > >
> > > Can we perhaps force such sharing early temporarily even when the 
> > > table is not filled up?  This way we would get better test 
> > coverage of 
> > > all of  this.
> > >
> > > That would be later disabled of course.
> > 
> > Suppose I added a static counter and pretended that every 
> > third non-legacy IRQ needed to be shared?
> > 
> > > Rest looks ok to me.
> > >
> > > -Andi
> > 
> > Sigh.  Have to attach the file again.  Sorry about that.
> > 
> > Signed-off-by:  James Cleverdon <jamesclv@us.ibm.com>
> 
> I think you were going to change this line, which fixed the jumps in the
> irq distribution:
>

Actually, no.  That line fixed the jumps but didn't keep irq_vector[]
values from being splattered in a corner case.  The use of
__assign_irq_vector() should fix that entirely.

Corner case:  GSIs aren't always presented in monotonic order (or at
least, we shouldn't depend on it).  Suppose we had already allocated
IRQ 16 when GSI 16 came along.  The call to assign_irq_vector() would
have the side effect of overwriting irq_vector[16], even though we
would ultimately assign IRQ 17 to the GSI.

Better not to change any global state until we're sure which IRQ will
be used; __assign_irq_vector() picks a vector without writing
irq_vector[] or vector_irq[].


> --- io_apic.c	2005-08-11 10:14:33.564748923 -0700
> +++ io_apic.c.new	2005-08-11 10:15:55.412331115 -0700
> @@ -617,7 +617,7 @@ int gsi_irq_sharing(int gsi)
>   	 * than PCI.
>   	 */
>   	for (i = 0; i < NR_IRQS; i++)
>  -		if (IO_APIC_VECTOR(i) == vector) {
>  +		if (IO_APIC_VECTOR(i) == vector && i != gsi) {
>   			if (!platform_legacy_irq(i))
>   				break;			/* got one */
>   			IO_APIC_VECTOR(gsi) = 0;
> But it's not in this version of the patch.
> Thanks,
> --Natalie
> > --
> > James Cleverdon
> > IBM LTC (xSeries Linux Solutions)
> > {jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm
> > 


-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-15  2:57       ` James Cleverdon
  2005-08-15  5:55         ` Zwane Mwaikambo
@ 2005-08-15 17:44         ` Andi Kleen
  2005-08-16  3:24           ` James Cleverdon
  1 sibling, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2005-08-15 17:44 UTC (permalink / raw)
  To: James Cleverdon
  Cc: Andi Kleen, Protasevich, Natalie, Russ Weight, linux-kernel

On Sun, Aug 14, 2005 at 07:57:53PM -0700, James Cleverdon wrote:
> On Thursday 04 August 2005 02:22 am, Andi Kleen wrote:
> > On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:
> > > diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c
> > > n12.3/arch/i386/kernel/acpi/boot.c ---
> > > 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000
> > > -0700 +++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04
> > > 00:01:10.199710211 -0700 @@ -42,6 +42,7 @@
> > >  static inline void  acpi_madt_oem_check(char *oem_id, char
> > > *oem_table_id) { } extern void __init clustered_apic_check(void);
> > >  static inline int ioapic_setup_disabled(void) { return 0; }
> > > +extern int gsi_irq_sharing(int gsi);
> > >  #include <asm/proto.h>
> > >
> > >  #else	/* X86 */
> > > @@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
> > >  #include <mach_mpparse.h>
> > >  #endif	/* CONFIG_X86_LOCAL_APIC */
> > >
> > > +static inline int gsi_irq_sharing(int gsi) { return gsi; }
> >
> > Why is this different for i386/x86-64? It shouldn't.
> 
> True.  Have added code for i386.  Unfortunately, I didn't see one file 
> that is shared by both architectures and which is included when 
> building with I/O APIC support.  So, I duplicated the function into 
> io_apic.c

That needs to be cleaned up before merge. This code is already ugly and I don't
want the cruft accumulating here.

> > As a unrelated note we really need to get rid of this whole ifdef
> > block.
> >
> > > +++ n12.3/arch/x86_64/Kconfig	2005-08-03 21:31:07.487451167 -0700
> > > @@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
> > >  	default y
> > >
> > >  config NR_CPUS
> > > -	int "Maximum number of CPUs (2-256)"
> > > -	range 2 256
> > > +	int "Maximum number of CPUs (2-255)"
> > > +	range 2 255
> > >  	depends on SMP
> > > -	default "8"
> > > +	default "16"
> >
> > Don't change the default please.
> >
> > > +static int next_irq = 16;
> >
> > Won't this need a lock for hotplug later?
> 
> That's what I thought originally, but maybe not.  We initialize all RTEs 
> and assign IRQs+vectors fairly early in boot, plus store the results in 
> arrays.  Thereafter the functions just return the preallocated values.

I was thinking of IO-APIC hotplug here. IIRC the ia64 folks
have it already and I'm sure someone will turn up with a patch
for i386/x86-64 soon. For devices it should be ok, you're right.

Ok I guess they can change it in that patch then. Perhaps
just add a comment.

> > > have a different trigger mode +	 * than PCI.
> > > +	 */
> >
> > Can we perhaps force such sharing early temporarily even when the
> > table is not filled up?  This way we would get better test coverage
> > of all of  this.
> >
> > That would be later disabled of course.
> 
> Suppose I added a static counter and pretended that every third 
> non-legacy IRQ needed to be shared?

Can you drop into the sharing path unconditionally?

-Andi

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-15 17:44         ` Andi Kleen
@ 2005-08-16  3:24           ` James Cleverdon
  2005-08-16  6:58             ` Andi Kleen
  0 siblings, 1 reply; 21+ messages in thread
From: James Cleverdon @ 2005-08-16  3:24 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Protasevich, Natalie, Russ Weight, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3631 bytes --]

On Monday 15 August 2005 10:44 am, Andi Kleen wrote:
> On Sun, Aug 14, 2005 at 07:57:53PM -0700, James Cleverdon wrote:
> > On Thursday 04 August 2005 02:22 am, Andi Kleen wrote:
> > > On Thu, Aug 04, 2005 at 12:05:50AM -0700, James Cleverdon wrote:
> > > > diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c
> > > > n12.3/arch/i386/kernel/acpi/boot.c ---
> > > > 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000
> > > > -0700 +++ n12.3/arch/i386/kernel/acpi/boot.c	2005-08-04
> > > > 00:01:10.199710211 -0700 @@ -42,6 +42,7 @@
> > > >  static inline void  acpi_madt_oem_check(char *oem_id, char
> > > > *oem_table_id) { } extern void __init clustered_apic_check(void);
> > > >  static inline int ioapic_setup_disabled(void) { return 0; }
> > > > +extern int gsi_irq_sharing(int gsi);
> > > >  #include <asm/proto.h>
> > > >
> > > >  #else	/* X86 */
> > > > @@ -51,6 +52,9 @@ static inline int ioapic_setup_disabled(
> > > >  #include <mach_mpparse.h>
> > > >  #endif	/* CONFIG_X86_LOCAL_APIC */
> > > >
> > > > +static inline int gsi_irq_sharing(int gsi) { return gsi; }
> > >
> > > Why is this different for i386/x86-64? It shouldn't.
> > 
> > True.  Have added code for i386.  Unfortunately, I didn't see one file 
> > that is shared by both architectures and which is included when 
> > building with I/O APIC support.  So, I duplicated the function into 
> > io_apic.c
> 
> That needs to be cleaned up before merge. This code is already ugly and I don't
> want the cruft accumulating here.

OK, I moved the function into a separate file that can be used by
both architectures.

> > > As a unrelated note we really need to get rid of this whole ifdef
> > > block.
> > >
> > > > +++ n12.3/arch/x86_64/Kconfig	2005-08-03 21:31:07.487451167 -0700
> > > > @@ -280,13 +280,13 @@ config HAVE_DEC_LOCK
> > > >  	default y
> > > >
> > > >  config NR_CPUS
> > > > -	int "Maximum number of CPUs (2-256)"
> > > > -	range 2 256
> > > > +	int "Maximum number of CPUs (2-255)"
> > > > +	range 2 255
> > > >  	depends on SMP
> > > > -	default "8"
> > > > +	default "16"
> > >
> > > Don't change the default please.
> > >
> > > > +static int next_irq = 16;
> > >
> > > Won't this need a lock for hotplug later?
> > 
> > That's what I thought originally, but maybe not.  We initialize all RTEs 
> > and assign IRQs+vectors fairly early in boot, plus store the results in 
> > arrays.  Thereafter the functions just return the preallocated values.
> 
> I was thinking of IO-APIC hotplug here. IIRC the ia64 folks
> have it already and I'm sure someone will turn up with a patch
> for i386/x86-64 soon. For devices it should be ok, you're right.
> 
> Ok I guess they can change it in that patch then. Perhaps
> just add a comment.

I've already got a spin lock there, so may as well keep it.

> > > > have a different trigger mode +	 * than PCI.
> > > > +	 */
> > >
> > > Can we perhaps force such sharing early temporarily even when the
> > > table is not filled up?  This way we would get better test coverage
> > > of all of  this.
> > >
> > > That would be later disabled of course.
> > 
> > Suppose I added a static counter and pretended that every third 
> > non-legacy IRQ needed to be shared?
> 
> Can you drop into the sharing path unconditionally?
> 
> -Andi

If no vectors/IRQs are ever allocated, there is nothing to share.
So I added some simple-minded forced sharing to gsi_irq_sharing():
it forces 1 in 3 IRQs to be shared, which should exercise some of
the code paths.

Patch attached.  (Sorry.)

-- 
James Cleverdon
IBM LTC (xSeries Linux Solutions)
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

[-- Attachment #2: vect_share_irq_2005-08-15_2.6.12.3 --]
[-- Type: text/x-diff, Size: 14936 bytes --]

diff -pruN 2.6.12.3/arch/i386/kernel/Makefile z12.3/arch/i386/kernel/Makefile
--- 2.6.12.3/arch/i386/kernel/Makefile	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/Makefile	2005-08-15 15:57:45.000000000 -0700
@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.ld
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
-		doublefault.o quirks.o
+		doublefault.o quirks.o gsi2irq.o
 
 obj-y				+= cpu/
 obj-y				+= timers/
diff -pruN 2.6.12.3/arch/i386/kernel/acpi/boot.c z12.3/arch/i386/kernel/acpi/boot.c
--- 2.6.12.3/arch/i386/kernel/acpi/boot.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/acpi/boot.c	2005-08-14 15:40:36.000000000 -0700
@@ -453,7 +453,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned in
  		*irq = IO_APIC_VECTOR(gsi);
 	else
 #endif
-		*irq = gsi;
+		*irq = gsi_irq_sharing(gsi);
 	return 0;
 }
 
diff -pruN 2.6.12.3/arch/i386/kernel/gsi2irq.c z12.3/arch/i386/kernel/gsi2irq.c
--- 2.6.12.3/arch/i386/kernel/gsi2irq.c	1969-12-31 16:00:00.000000000 -0800
+++ z12.3/arch/i386/kernel/gsi2irq.c	2005-08-15 18:18:24.000000000 -0700
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2005 James Cleverdon, IBM.
+ * Subject to the GNU Public License, v.2
+ *
+ * gsi2irq.c:
+ *
+ * IRQ and vector compression/sharing routines for i386 and x86-64 by
+ * James Cleverdon from a patch by Natalie Protasevich and
+ * architecture code from io_apic.c
+ */
+#include <linux/config.h>
+#include <asm/smp.h>
+#include <linux/irq.h>
+#include <asm/hw_irq.h>
+
+#ifdef	CONFIG_X86_64
+
+#include <asm/irq.h>
+
+#else	/* X86 */
+
+#include <irq_vectors_limits.h>
+
+#endif	/* X86 */
+
+
+#ifdef CONFIG_X86_IO_APIC
+
+extern int __assign_irq_vector(int irq);
+
+static int next_irq = 16;
+
+static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF };
+
+static DEFINE_SPINLOCK(gsi_irq_lock);
+
+#define DEBUG_GSI_IRQ_SHARING	1
+
+#ifdef DEBUG_GSI_IRQ_SHARING
+
+#undef KERN_INFO	/* Raise printk level. */
+#define KERN_INFO	KERN_ERR
+
+#define DEBUG_GSI_FORCE_SHARE_N	3
+static int gsi_irq_dbg_cnt = DEBUG_GSI_FORCE_SHARE_N;
+static int gsi_irq_dbg_irq = 16;
+
+#endif	/* DEBUG_GSI_IRQ_SHARING */
+
+
+/*
+ * gsi_irq_sharing -- Name overload!  "irq" can be either a legacy IRQ
+ * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number
+ * from ACPI, which can reach 800 in large boxen.
+ *
+ * Compact the sparse GSI space into a sequential IRQ series and reuse
+ * vectors if possible.
+ */
+int gsi_irq_sharing(int gsi)
+{
+	int i, tries, vector;
+	unsigned long flags;
+
+	BUG_ON(gsi >= NR_IRQ_VECTORS);
+
+	if (platform_legacy_irq(gsi)) {
+		gsi_2_irq[gsi] = gsi;
+		return gsi;
+	}
+
+	if (gsi_2_irq[gsi] != 0xFF)
+		return (int)gsi_2_irq[gsi];
+
+	tries = NR_IRQS;
+  try_again:
+	vector = __assign_irq_vector(gsi);
+
+	/*
+	 * Sharing vectors means sharing IRQs, so scan irq_vectors for previous
+	 * use of vector and if found, return that IRQ.  However, we never want
+	 * to share legacy IRQs, which usually have a different trigger mode
+	 * than PCI.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		if (IO_APIC_VECTOR(i) == vector)
+			break;
+	if (platform_legacy_irq(i)) {
+		if (--tries >= 0)
+			goto try_again;
+		panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi);
+	}
+	if (i < NR_IRQS) {
+		gsi_2_irq[gsi] = i;
+		printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n",
+				gsi, vector, i);
+		return i;
+	}
+
+	spin_lock_irqsave(&gsi_irq_lock, flags);
+#ifdef DEBUG_GSI_IRQ_SHARING
+	if (--gsi_irq_dbg_cnt < 0) {
+		/* Debug:  Force sharing on 1 of N IRQs.  N must be > 1. */
+		gsi_irq_dbg_cnt = DEBUG_GSI_FORCE_SHARE_N;
+		i = gsi_irq_dbg_irq++;
+		if (gsi_irq_dbg_irq >= next_irq)
+			gsi_irq_dbg_irq = 16;
+		gsi_2_irq[gsi] = i;
+		spin_unlock_irqrestore(&gsi_irq_lock, flags);
+		printk(KERN_INFO "GSI %d debug-share vector 0x%02X and IRQ %d\n",
+				gsi, IO_APIC_VECTOR(i), i);
+		return i;
+	}
+#endif	/* DEBUG_GSI_IRQ_SHARING */
+	i = next_irq++;
+	BUG_ON(i >= NR_IRQS);
+	gsi_2_irq[gsi] = i;
+	IO_APIC_VECTOR(i) = vector;
+	vector_irq[vector] = i;
+	spin_unlock_irqrestore(&gsi_irq_lock, flags);
+	printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n",
+			gsi, vector, i);
+	return i;
+}
+
+#else	/* CONFIG_X86_IO_APIC */
+
+/* No compression needed if no I/O APICs. */
+
+int gsi_irq_sharing(int gsi)
+{
+	return gsi;
+}
+
+#endif	/* CONFIG_X86_IO_APIC */
diff -pruN 2.6.12.3/arch/i386/kernel/io_apic.c z12.3/arch/i386/kernel/io_apic.c
--- 2.6.12.3/arch/i386/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/io_apic.c	2005-08-15 14:40:38.000000000 -0700
@@ -62,7 +62,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -1071,6 +1071,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 
 			/*
 			 * For MPS mode, so far only needed by ES7000 platform
@@ -1127,11 +1128,11 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
 	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
@@ -1140,17 +1141,24 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	vector_irq[current_vector] = irq;
+	return current_vector;
+}
+
+int assign_irq_vector(int irq)
+{
+	int vect;
+
+	vect = __assign_irq_vector(irq);
+	vector_irq[vect] = irq;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vect;
 
-	return current_vector;
+	return vect;
 }
 
 static struct hw_interrupt_type ioapic_level_type;
@@ -2514,6 +2522,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask  = 1;
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pruN 2.6.12.3/arch/i386/kernel/mpparse.c z12.3/arch/i386/kernel/mpparse.c
--- 2.6.12.3/arch/i386/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/i386/kernel/mpparse.c	2005-08-14 15:33:52.000000000 -0700
@@ -274,7 +274,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pruN 2.6.12.3/arch/x86_64/kernel/Makefile z12.3/arch/x86_64/kernel/Makefile
--- 2.6.12.3/arch/x86_64/kernel/Makefile	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/Makefile	2005-08-15 15:06:26.000000000 -0700
@@ -7,7 +7,7 @@ EXTRA_AFLAGS	:= -traditional
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
-		setup64.o bootflag.o e820.o reboot.o quirks.o
+		setup64.o bootflag.o e820.o reboot.o quirks.o gsi2irq.o
 
 obj-$(CONFIG_X86_MCE)         += mce.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
@@ -44,3 +44,4 @@ swiotlb-$(CONFIG_SWIOTLB)      += ../../
 microcode-$(subst m,y,$(CONFIG_MICROCODE))  += ../../i386/kernel/microcode.o
 intel_cacheinfo-y		+= ../../i386/kernel/cpu/intel_cacheinfo.o
 quirks-y			+= ../../i386/kernel/quirks.o
+gsi2irq-y			+= ../../i386/kernel/gsi2irq.o
diff -pruN 2.6.12.3/arch/x86_64/kernel/io_apic.c z12.3/arch/x86_64/kernel/io_apic.c
--- 2.6.12.3/arch/x86_64/kernel/io_apic.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/io_apic.c	2005-08-15 15:18:07.000000000 -0700
@@ -56,7 +56,7 @@ int nr_ioapic_registers[MAX_IO_APICS];
  * Rough estimation of how many shared IRQs there are, can
  * be changed anytime.
  */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define MAX_PLUS_SHARED_IRQS NR_IRQ_VECTORS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
 
 /*
@@ -88,6 +88,7 @@ static void add_pin_to_irq(unsigned int 
 	static int first_free_entry = NR_IRQS;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 
+	BUG_ON(irq >= NR_IRQS);
 	while (entry->next)
 		entry = irq_2_pin + entry->next;
 
@@ -95,7 +96,7 @@ static void add_pin_to_irq(unsigned int 
 		entry->next = first_free_entry;
 		entry = irq_2_pin + entry->next;
 		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
+			panic("io_apic.c: ran out of irq_2_pin entries!");
 	}
 	entry->apic = apic;
 	entry->pin = pin;
@@ -610,6 +611,7 @@ static int pin_2_irq(int idx, int apic, 
 			while (i < apic)
 				irq += nr_ioapic_registers[i++];
 			irq += pin;
+			irq = gsi_irq_sharing(irq);
 			break;
 		}
 		default:
@@ -619,6 +621,7 @@ static int pin_2_irq(int idx, int apic, 
 			break;
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 
 	/*
 	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
@@ -634,6 +637,7 @@ static int pin_2_irq(int idx, int apic, 
 			}
 		}
 	}
+	BUG_ON(irq >= NR_IRQS);
 	return irq;
 }
 
@@ -657,12 +661,12 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
 
-	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -670,17 +674,24 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset++;
-		if (!(offset%8))
-			return -ENOSPC;
+		/* If we run out of vectors on large boxen, must share them. */
+		offset = (offset + 1) % 8;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	vector_irq[current_vector] = irq;
+	return current_vector;
+}
+
+int assign_irq_vector(int irq)
+{
+	int vect;
+
+	vect = __assign_irq_vector(irq);
+	vector_irq[vect] = irq;
 	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = current_vector;
+		IO_APIC_VECTOR(irq) = vect;
 
-	return current_vector;
+	return vect;
 }
 
 extern void (*interrupt[NR_IRQS])(void);
@@ -1866,6 +1877,7 @@ int io_apic_set_pci_routing (int ioapic,
 	entry.polarity = active_high_low;
 	entry.mask = 1;					 /* Disabled (masked) */
 
+	irq = gsi_irq_sharing(irq);
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
diff -pruN 2.6.12.3/arch/x86_64/kernel/mpparse.c z12.3/arch/x86_64/kernel/mpparse.c
--- 2.6.12.3/arch/x86_64/kernel/mpparse.c	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/arch/x86_64/kernel/mpparse.c	2005-08-11 19:34:53.000000000 -0700
@@ -214,7 +214,7 @@ static void __init MP_intsrc_info (struc
 			m->mpc_irqtype, m->mpc_irqflag & 3,
 			(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
 			m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
+	if (++mp_irq_entries >= MAX_IRQ_SOURCES)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
diff -pruN 2.6.12.3/include/asm-i386/apic.h z12.3/include/asm-i386/apic.h
--- 2.6.12.3/include/asm-i386/apic.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-i386/apic.h	2005-08-14 15:34:49.000000000 -0700
@@ -108,6 +108,7 @@ extern void nmi_watchdog_tick (struct pt
 extern int APIC_init_uniprocessor (void);
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
+extern int gsi_irq_sharing(int gsi);
 
 extern void enable_NMI_through_LVT0 (void * dummy);
 
diff -pruN 2.6.12.3/include/asm-i386/hw_irq.h z12.3/include/asm-i386/hw_irq.h
--- 2.6.12.3/include/asm-i386/hw_irq.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-i386/hw_irq.h	2005-08-15 15:14:32.000000000 -0700
@@ -28,6 +28,7 @@
 extern u8 irq_vector[NR_IRQ_VECTORS];
 #define IO_APIC_VECTOR(irq)	(irq_vector[irq])
 #define AUTO_ASSIGN		-1
+extern int vector_irq[NR_VECTORS];
 
 extern void (*interrupt[NR_IRQS])(void);
 
diff -pruN 2.6.12.3/include/asm-i386/mach-generic/mach_mpspec.h z12.3/include/asm-i386/mach-generic/mach_mpspec.h
--- 2.6.12.3/include/asm-i386/mach-generic/mach_mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-i386/mach-generic/mach_mpspec.h	2005-08-14 15:39:10.000000000 -0700
@@ -1,7 +1,8 @@
 #ifndef __ASM_MACH_MPSPEC_H
 #define __ASM_MACH_MPSPEC_H
 
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 
 /* Summit or generic (i.e. installer) kernels need lots of bus entries. */
 /* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
diff -pruN 2.6.12.3/include/asm-x86_64/apic.h z12.3/include/asm-x86_64/apic.h
--- 2.6.12.3/include/asm-x86_64/apic.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/apic.h	2005-08-14 15:36:51.000000000 -0700
@@ -98,6 +98,7 @@ extern int APIC_init_uniprocessor (void)
 extern void disable_APIC_timer(void);
 extern void enable_APIC_timer(void);
 extern void clustered_apic_check(void);
+extern int gsi_irq_sharing(int gsi);
 
 extern void nmi_watchdog_default(void);
 extern int setup_nmi_watchdog(char *);
diff -pruN 2.6.12.3/include/asm-x86_64/hw_irq.h z12.3/include/asm-x86_64/hw_irq.h
--- 2.6.12.3/include/asm-x86_64/hw_irq.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/hw_irq.h	2005-08-15 15:14:41.000000000 -0700
@@ -79,6 +75,7 @@ struct hw_interrupt_type;
 extern u8 irq_vector[NR_IRQ_VECTORS];
 #define IO_APIC_VECTOR(irq)	(irq_vector[irq])
 #define AUTO_ASSIGN		-1
+extern int vector_irq[NR_VECTORS];
 
 /*
  * Various low-level irq details needed by irq.c, process.c,
diff -pruN 2.6.12.3/include/asm-x86_64/mpspec.h z12.3/include/asm-x86_64/mpspec.h
--- 2.6.12.3/include/asm-x86_64/mpspec.h	2005-07-15 14:18:57.000000000 -0700
+++ z12.3/include/asm-x86_64/mpspec.h	2005-08-10 17:08:45.000000000 -0700
@@ -157,7 +157,8 @@ struct mpc_config_lintsrc
  */
 
 #define MAX_MP_BUSSES 256
-#define MAX_IRQ_SOURCES 256
+/* Each PCI slot may be a combo card with its own bus.  4 IRQ pins per slot. */
+#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
 enum mp_bustype {
 	MP_BUS_ISA = 1,
 	MP_BUS_EISA,

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [RFC][2.6.12.3] IRQ compression/sharing patch
  2005-08-16  3:24           ` James Cleverdon
@ 2005-08-16  6:58             ` Andi Kleen
  0 siblings, 0 replies; 21+ messages in thread
From: Andi Kleen @ 2005-08-16  6:58 UTC (permalink / raw)
  To: James Cleverdon
  Cc: Andi Kleen, Protasevich, Natalie, Russ Weight, linux-kernel


Patch looks good to me now. Thanks.
-Andi

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2005-08-16  6:58 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-08-11  0:21 [RFC][2.6.12.3] IRQ compression/sharing patch Protasevich, Natalie
2005-08-11  3:14 ` James Cleverdon
  -- strict thread matches above, loose matches on Subject: below --
2005-08-15  4:35 Protasevich, Natalie
2005-08-15 17:11 ` James Cleverdon
2005-08-11 22:02 Protasevich, Natalie
2005-08-11 22:34 ` Zwane Mwaikambo
2005-08-11 21:55 Protasevich, Natalie
2005-08-12  1:07 ` James Cleverdon
2005-08-12  2:59 ` James Cleverdon
2005-08-11 13:15 Protasevich, Natalie
2005-08-11 17:24 ` James Cleverdon
2005-08-10 21:03 Protasevich, Natalie
2005-08-10 23:55 ` James Cleverdon
2005-08-11 17:52 ` Zwane Mwaikambo
2005-07-26  7:12 [RFC][2.6.13-rc3-mm1] " James Cleverdon
2005-07-26 16:03 ` Andi Kleen
2005-08-04  7:05   ` [RFC][2.6.12.3] " James Cleverdon
2005-08-04  9:22     ` Andi Kleen
2005-08-15  2:57       ` James Cleverdon
2005-08-15  5:55         ` Zwane Mwaikambo
2005-08-15 17:44         ` Andi Kleen
2005-08-16  3:24           ` James Cleverdon
2005-08-16  6:58             ` Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox