public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: William Lee Irwin III <wli@holomorphy.com>
To: linux-kernel@vger.kernel.org, zwane@linuxpower.ca, zab@zabbo.net,
	manfred@colorfullife.com, macro@ds2.pg.gda.pl,
	Martin.Bligh@us.ibm.com, jamesclv@us.ibm.com
Cc: andrew.grover@intel.com
Subject: Re: 48GB NUMA-Q boots, with major IO-APIC hassles
Date: Sat, 18 Jan 2003 17:50:13 -0800	[thread overview]
Message-ID: <20030119015013.GB780@holomorphy.com> (raw)
In-Reply-To: <20030119014326.GB789@holomorphy.com>

On Wed, Jan 15, 2003 at 02:58:02AM -0800, William Lee Irwin III wrote:
>> (1) I've got 320 IRQ sources. This panic()'s in setup_IO_APIC_irqs().

On Sat, Jan 18, 2003 at 05:43:26PM -0800, William Lee Irwin III wrote:
> Where do you go for IO-APIC issues? Well, my MP tables say (Zwane
> pointed out fsmp@FreeBSD.org's mptable code):

Okay, and here is my latest attempt to deal with the issue (which is
dirty as sin code-wise, but never mind that... I'm trying to debug this).

This doesn't actually work: I end up deadlocking, presumably because
everything's waiting for an interrupt that's been dropped at some point.


diff -urpN mm1-2.5.59/arch/i386/kernel/io_apic.c irq-2.5.59-1/arch/i386/kernel/io_apic.c
--- mm1-2.5.59/arch/i386/kernel/io_apic.c	2003-01-17 01:04:43.000000000 -0800
+++ irq-2.5.59-1/arch/i386/kernel/io_apic.c	2003-01-18 15:15:01.000000000 -0800
@@ -647,6 +647,7 @@ static int __init find_irq_entry(int api
 
 	for (i = 0; i < mp_irq_entries; i++)
 		if (mp_irqs[i].mpc_irqtype == type &&
+		    mp_bus_id_to_node[mp_irqs[i].mpc_srcbus] == apic/2 &&
 		    (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
 		     mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
 		    mp_irqs[i].mpc_dstirq == pin)
@@ -696,8 +697,9 @@ int IO_APIC_get_PCI_irq_vector(int bus, 
 		int lbus = mp_irqs[i].mpc_srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
-			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+			if ((mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic ||
+			    mp_irqs[i].mpc_dstapic == MP_APIC_ALL) &&
+		    	    mp_bus_id_to_node[mp_irqs[i].mpc_srcbus] == apic/2)
 				break;
 
 		if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
@@ -914,6 +916,12 @@ static int pin_2_irq(int idx, int apic, 
 	int irq, i;
 	int bus = mp_irqs[idx].mpc_srcbus;
 
+#ifdef CONFIG_X86_NUMAQ
+	if (mp_bus_id_to_node[bus] != apic/2)
+		printk(KERN_ERR "bus %d on node %d, apic %d on node %d\n",
+			bus, mp_bus_id_to_node[bus], apic, apic/2);
+#endif
+
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
 	 */
@@ -930,6 +938,10 @@ static int pin_2_irq(int idx, int apic, 
 			break;
 		}
 		case MP_BUS_PCI: /* PCI pin */
+#ifdef CONFIG_X86_NUMAQ
+		irq = apic_pin_to_irq[apic][pin];
+		break;
+#else
 		{
 			/*
 			 * PCI IRQs are mapped in order
@@ -940,6 +952,7 @@ static int pin_2_irq(int idx, int apic, 
 			irq += pin;
 			break;
 		}
+#endif
 		default:
 		{
 			printk(KERN_ERR "unknown bus type %d.\n",bus); 
@@ -984,6 +997,80 @@ static inline int IO_APIC_irq_trigger(in
 
 int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
 
+#ifdef CONFIG_X86_NUMAQ
+
+int vector_to_irq[MAX_NUMNODES][FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR + 1];
+int apic_pin_to_irq[MAX_IO_APICS][24];
+
+/*
+ * timer vectors must always go to 0
+ * vectors < FIRST_DEVICE_VECTOR are 1:1
+ * everything else goes through the table
+ */
+
+static void __init init_vector_to_irq(void)
+{
+	int n, v;
+	for (n = 0; n < MAX_NUMNODES; ++n) {
+		for (v = 1; v <= FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR; ++v)
+			vector_to_irq[n][v] = -1;
+		vector_to_irq[n][0] = 0;
+	}
+	for (n = 0; n < MAX_IO_APICS; ++n)
+		for (v = 0; v < 24; ++v)
+			apic_pin_to_irq[n][v] = -1;
+}
+
+int irq_of_vector(int vector)
+{
+	/* low vectors are 1:1; the rest go through the per-node table */
+	if (vector < FIRST_DEVICE_VECTOR)
+		return vector;
+	if (vector > FIRST_SYSTEM_VECTOR)
+		return -1;	/* would index past the end of vector_to_irq[][] */
+	return vector_to_irq[numa_node_id()][vector-FIRST_DEVICE_VECTOR];
+}
+
+static void set_irq_of_vector(int apic, int vector, int irq)
+{
+	vector_to_irq[apic/2][vector-FIRST_DEVICE_VECTOR] = irq;
+}
+
+static void set_irq_of_pin(int apic, int pin, int irq)
+{
+	apic_pin_to_irq[apic][pin] = irq;
+}
+
+static int __init next_irq_vector(int vector)
+{
+	++vector;
+	if (vector >= FIRST_SYSTEM_VECTOR)
+		vector = FIRST_DEVICE_VECTOR + 1;
+	else if (vector == SYSCALL_VECTOR)
+		++vector;
+	return vector;
+}
+
+static int __init assign_irq_vector(int irq)
+{
+	static int current_vector = FIRST_DEVICE_VECTOR+1;
+	if (!irq)
+		return FIRST_DEVICE_VECTOR;
+	else if (!irq_vector[irq]) {
+		irq_vector[irq] = current_vector;
+		current_vector = next_irq_vector(current_vector);
+	}
+	return irq_vector[irq];
+}
+
+#else
+
+#define init_vector_to_irq()		do {} while (0)
+#define set_irq_of_vector(a,v,i)	do {} while (0)
+#define set_irq_of_pin(a,v,i)	do {} while (0)
+
+int irq_of_vector(int vector)	{ return vector; }
+
 static int __init assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
@@ -1005,6 +1092,7 @@ next:
 	IO_APIC_VECTOR(irq) = current_vector;
 	return current_vector;
 }
+#endif
 
 static struct hw_interrupt_type ioapic_level_irq_type;
 static struct hw_interrupt_type ioapic_edge_irq_type;
@@ -1017,6 +1105,8 @@ void __init setup_IO_APIC_irqs(void)
 
 	printk(KERN_DEBUG "init IO_APIC IRQs\n");
 
+	init_vector_to_irq();
+
 	for (apic = 0; apic < nr_ioapics; apic++) {
 	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
 
@@ -1064,13 +1154,19 @@ void __init setup_IO_APIC_irqs(void)
 		if (IO_APIC_IRQ(irq)) {
 			vector = assign_irq_vector(irq);
 			entry.vector = vector;
+			set_irq_of_vector(apic, vector, irq);
+			set_irq_of_pin(apic, pin, irq);
 
 			if (IO_APIC_irq_trigger(irq))
 				irq_desc[irq].handler = &ioapic_level_irq_type;
 			else
 				irq_desc[irq].handler = &ioapic_edge_irq_type;
 
+#ifdef CONFIG_X86_NUMAQ
+			set_intr_gate(vector, interrupt[vector]);
+#else
 			set_intr_gate(vector, interrupt[irq]);
+#endif
 		
 			if (!apic && (irq < 16))
 				disable_8259A_irq(irq);
@@ -1457,6 +1553,7 @@ static void __init setup_ioapic_ids_from
 	 * Set the IOAPIC ID to the value stored in the MPC table.
 	 */
 	for (apic = 0; apic < nr_ioapics; apic++) {
+		unsigned long numaq_ioapic_id;
 
 		/* Read the register 0 value */
 		spin_lock_irqsave(&ioapic_lock, flags);
@@ -1465,6 +1562,7 @@ static void __init setup_ioapic_ids_from
 		
 		old_id = mp_ioapics[apic].mpc_apicid;
 
+#ifndef CONFIG_X86_NUMAQ
 		if (mp_ioapics[apic].mpc_apicid >= APIC_BROADCAST_ID) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
 				apic, mp_ioapics[apic].mpc_apicid);
@@ -1495,6 +1593,7 @@ static void __init setup_ioapic_ids_from
 			printk("Setting %d in the phys_id_present_map\n", mp_ioapics[apic].mpc_apicid);
 			phys_id_present_map |= 1 << mp_ioapics[apic].mpc_apicid;
 		}
+#endif /* CONFIG_X86_NUMAQ */
 
 
 		/*
@@ -1507,14 +1606,19 @@ static void __init setup_ioapic_ids_from
 					mp_irqs[i].mpc_dstapic
 						= mp_ioapics[apic].mpc_apicid;
 
+#ifdef CONFIG_X86_NUMAQ
+		numaq_ioapic_id = (mp_ioapics[apic].mpc_apicid & 1) ? 13 : 14;
+#else
+		numaq_ioapic_id = mp_ioapics[apic].mpc_apicid;
+#endif /* CONFIG_X86_NUMAQ */
+
 		/*
 		 * Read the right value from the MPC table and
 		 * write it into the ID register.
 	 	 */
-		printk(KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
-					mp_ioapics[apic].mpc_apicid);
+		printk(KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", (int)numaq_ioapic_id);
 
-		reg_00.ID = mp_ioapics[apic].mpc_apicid;
+		reg_00.ID = numaq_ioapic_id;
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0, *(int *)&reg_00);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -1525,7 +1629,7 @@ static void __init setup_ioapic_ids_from
 		spin_lock_irqsave(&ioapic_lock, flags);
 		*(int *)&reg_00 = io_apic_read(apic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.ID != mp_ioapics[apic].mpc_apicid)
+		if (reg_00.ID != numaq_ioapic_id)
 			panic("could not set ID!\n");
 		else
 			printk(" ok.\n");
@@ -1741,7 +1845,7 @@ static struct hw_interrupt_type ioapic_l
 	set_ioapic_affinity,
 };
 
-static inline void init_IO_APIC_traps(void)
+static void init_IO_APIC_traps(void)
 {
 	int irq;
 
@@ -1765,9 +1869,12 @@ static inline void init_IO_APIC_traps(vo
 			 */
 			if (irq < 16)
 				make_8259A_irq(irq);
-			else
+			else {
 				/* Strange. Oh, well.. */
 				irq_desc[irq].handler = &no_irq_type;
+				printk("init_IO_APIC_traps():"
+					"unhandled irq %d\n", irq);
+			}
 		}
 	}
 }
@@ -1915,7 +2022,11 @@ static inline void check_timer(void)
 	 */
 	disable_8259A_irq(0);
 	vector = assign_irq_vector(0);
+#ifdef CONFIG_X86_NUMAQ
+	set_intr_gate(vector, interrupt[vector]);
+#else
 	set_intr_gate(vector, interrupt[0]);
+#endif
 
 	/*
 	 * Subtle, code in do_timer_interrupt() expects an AEOI
diff -urpN mm1-2.5.59/arch/i386/kernel/irq.c irq-2.5.59-1/arch/i386/kernel/irq.c
--- mm1-2.5.59/arch/i386/kernel/irq.c	2003-01-17 01:04:43.000000000 -0800
+++ irq-2.5.59-1/arch/i386/kernel/irq.c	2003-01-18 11:11:40.000000000 -0800
@@ -69,6 +69,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline
 	{ [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}};
 
 static void register_irq_proc (unsigned int irq);
+int irq_of_vector(int);
 
 /*
  * Special irq handlers.
@@ -92,6 +93,7 @@ static void ack_none(unsigned int irq)
  */
 #if CONFIG_X86
 	printk("unexpected IRQ trap at vector %02x\n", irq);
+	dump_stack();
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * Currently unexpected vectors happen only on SMP and APIC.
@@ -323,12 +325,19 @@ asmlinkage unsigned int do_IRQ(struct pt
 	 * 0 return value means that this irq is already being
 	 * handled by some other CPU. (or is disabled)
 	 */
-	int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code  */
+	/* high bits used in ret_from_ code  */
+	int irq = irq_of_vector(regs.orig_eax & 0xff);
 	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
 
+	if (irq < 0) {
+		printk("bad vector %ld, irq %d\n", regs.orig_eax & 0xff, irq);
+		dump_stack();
+		return 1;
+	}
+
 	irq_enter();
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
diff -urpN mm1-2.5.59/arch/i386/pci/numa.c irq-2.5.59-1/arch/i386/pci/numa.c
--- mm1-2.5.59/arch/i386/pci/numa.c	2003-01-16 18:21:44.000000000 -0800
+++ irq-2.5.59-1/arch/i386/pci/numa.c	2003-01-18 11:11:40.000000000 -0800
@@ -117,6 +117,14 @@ struct pci_fixup pcibios_fixups[] = {
 	{ PCI_FIXUP_HEADER,	PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_82451NX,	pci_fixup_i450nx },
 };
 
+void pci_fixup_child(struct pci_bus *parent, struct pci_bus *child, unsigned int buses)
+{
+	int quad = BUS2QUAD(parent->number);
+	child->primary = QUADLOCAL2BUS(quad, buses & 0xFF);
+	child->secondary = QUADLOCAL2BUS(quad, (buses >> 8) & 0xFF);
+	child->subordinate = QUADLOCAL2BUS(quad, (buses >> 16) & 0xFF);
+}
+
 static int __init pci_numa_init(void)
 {
 	int quad;
@@ -127,7 +135,7 @@ static int __init pci_numa_init(void)
 		return 0;
 
 	pci_root_bus = pcibios_scan_root(0);
-	if (numnodes > 1) {
+	if (0 && numnodes > 1) {
 		for (quad = 1; quad < numnodes; ++quad) {
 			printk("Scanning PCI bus %d for quad %d\n", 
 				QUADLOCAL2BUS(quad,0), quad);
diff -urpN mm1-2.5.59/drivers/pci/probe.c irq-2.5.59-1/drivers/pci/probe.c
--- mm1-2.5.59/drivers/pci/probe.c	2003-01-16 18:22:24.000000000 -0800
+++ irq-2.5.59-1/drivers/pci/probe.c	2003-01-18 11:11:40.000000000 -0800
@@ -244,6 +244,17 @@ struct pci_bus * __devinit pci_add_new_b
 	return child;
 }
 
+#ifdef CONFIG_X86_NUMAQ
+void pci_fixup_child(struct pci_bus *parent, struct pci_bus *child, unsigned int buses);
+#else
+void pci_fixup_child(struct pci_bus *parent, struct pci_bus *child, unsigned int buses)
+{
+	child->primary = buses & 0xFF;
+	child->secondary = (buses >> 8) & 0xFF;
+	child->subordinate = (buses >> 16) & 0xFF;
+}
+#endif
+
 /*
  * If it's a bridge, configure it and scan the bus behind it.
  * For CardBus bridges, we don't scan behind as the devices will
@@ -271,9 +282,7 @@ int __devinit pci_scan_bridge(struct pci
 		if (pass)
 			return max;
 		child = pci_add_new_bus(bus, dev, 0);
-		child->primary = buses & 0xFF;
-		child->secondary = (buses >> 8) & 0xFF;
-		child->subordinate = (buses >> 16) & 0xFF;
+		pci_fixup_child(bus, child, buses);
 		child->number = child->secondary;
 		cmax = pci_do_scan_bus(child);
 		if (cmax > max) max = cmax;
diff -urpN mm1-2.5.59/include/asm-i386/hardirq.h irq-2.5.59-1/include/asm-i386/hardirq.h
--- mm1-2.5.59/include/asm-i386/hardirq.h	2003-01-16 18:22:26.000000000 -0800
+++ irq-2.5.59-1/include/asm-i386/hardirq.h	2003-01-18 11:11:40.000000000 -0800
@@ -34,7 +34,11 @@ typedef struct {
 
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
+#ifdef CONFIG_X86_NUMAQ
+#define HARDIRQ_BITS	9
+#else
 #define HARDIRQ_BITS	8
+#endif
 
 #define PREEMPT_SHIFT	0
 #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
diff -urpN mm1-2.5.59/include/asm-i386/mach-numaq/irq_vectors.h irq-2.5.59-1/include/asm-i386/mach-numaq/irq_vectors.h
--- mm1-2.5.59/include/asm-i386/mach-numaq/irq_vectors.h	1969-12-31 16:00:00.000000000 -0800
+++ irq-2.5.59-1/include/asm-i386/mach-numaq/irq_vectors.h	2003-01-18 11:11:40.000000000 -0800
@@ -0,0 +1,81 @@
+/*
+ * This file should contain #defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ *	FIRST_EXTERNAL_VECTOR:
+ *		The first free place for external interrupts
+ *
+ *	SYSCALL_VECTOR:
+ *		The IRQ vector a syscall makes the user to kernel transition
+ *		under.
+ *
+ *	TIMER_IRQ:
+ *		The IRQ number the timer interrupt comes in at.
+ *
+ *	NR_IRQS:
+ *		The total number of interrupt vectors (including all the
+ *		architecture specific interrupts) needed.
+ *
+ */			
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR	0x20
+
+#define SYSCALL_VECTOR		0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ *  some of the following vectors are 'rare', they are merged
+ *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ *  TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR	0xff
+#define ERROR_APIC_VECTOR	0xfe
+#define INVALIDATE_TLB_VECTOR	0xfd
+#define RESCHEDULE_VECTOR	0xfc
+#define CALL_FUNCTION_VECTOR	0xfb
+
+#define THERMAL_APIC_VECTOR	0xf0
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR	0xef
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR	0x31
+#define FIRST_SYSTEM_VECTOR	0xef
+
+#define TIMER_IRQ 0
+
+/*
+ * 256 vectors is an architectural limit, and since vectors
+ * 0x00-0x1f are used/reserved for the CPU, the usable vector
+ * space is 0x20-0xff (224 vectors).
+ * NR_IRQS is 512 here, i.e. larger than the vector space, so
+ * IRQ numbers and vector numbers are not interchangeable.
+ * NOTE(review): an accounting of 16 8259A IRQ's + 208 potential
+ * APIC interrupt sources gives 224, not 512 -- confirm the figure.
+ */
+#define NR_IRQS 512
+
+#endif /* _ASM_IRQ_VECTORS_H */

  reply	other threads:[~2003-01-19  1:41 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-01-15 10:58 48GB NUMA-Q boots, with major IO-APIC hassles William Lee Irwin III
2003-01-15 11:24 ` Anton Blanchard
2003-01-15 11:55   ` William Lee Irwin III
2003-01-15 12:32     ` Anton Blanchard
2003-01-15 13:10       ` William Lee Irwin III
2003-01-15 15:24 ` Martin J. Bligh
2003-01-15 15:34   ` William Lee Irwin III
2003-01-19  1:43 ` William Lee Irwin III
2003-01-19  1:50   ` William Lee Irwin III [this message]
2003-01-19  2:13     ` Zwane Mwaikambo
2003-01-19  2:27       ` William Lee Irwin III
2003-01-19  2:32     ` Zwane Mwaikambo
2003-01-19  2:55       ` William Lee Irwin III
2003-01-19  3:08         ` William Lee Irwin III
2003-03-28  5:08 ` William Lee Irwin III
  -- strict thread matches above, loose matches on Subject: below --
2003-01-15 17:32 Protasevich, Natalie
2003-01-15 22:01 ` Martin J. Bligh

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20030119015013.GB780@holomorphy.com \
    --to=wli@holomorphy.com \
    --cc=Martin.Bligh@us.ibm.com \
    --cc=andrew.grover@intel.com \
    --cc=jamesclv@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=macro@ds2.pg.gda.pl \
    --cc=manfred@colorfullife.com \
    --cc=zab@zabbo.net \
    --cc=zwane@linuxpower.ca \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox