public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-13 23:30   ` Andrea Arcangeli
@ 2002-08-23  2:31     ` James Cleverdon
  0 siblings, 0 replies; 8+ messages in thread
From: James Cleverdon @ 2002-08-23  2:31 UTC (permalink / raw)
  To: Andrea Arcangeli, Andrew Theurer, acpi-devel
  Cc: Linus Torvalds, Martin J. Bligh, Alan Cox, linux-kernel,
	Dave Jones

Here's my first cut of the 2.5 Summit patch that allows you to boot the x440 
NUMA box and actually get all CPUs on-line.  While similar to the patch in 
Alan's 2.4 tree (and in SuSE 8.0), this patch uses logical mode interrupts so 
that we can make the TPR hardware do real-time IRQ routing to less busy 
CPUs.  As a result, this code may do P3 and earlier systems some good as 
well.  No need for the balance_irq function (which is crudely commented out) 
on P4 boxen.

What's the catch?  I'm glad you asked.  On my test systems this drops all SCSI 
interrupts when the ACPI hyperthreading-only config option is turned on.  The 
system boots fine with that option turned off, using the MPS table.  Funny thing:  the 
IRQ table shows 38 entries with MPS but only 18 for ACPI -- just about what 
you'd expect for the legacy IRQs plus some interrupt source overrides.  Does 
anyone know if this is expected behavior?  If not, what happened to the other 
IRQs?

Note:  I can't do a thing about the xAPIC bridge HW's tie breaker rule.  On 
idle systems the lowest numbered CPU in each APIC cluster is going to be hit 
by most of the interrupts.  So what?  It was idle anyway.  On busier systems, 
the interrupt counts start evening out.  So, folks should not expect 
balance_irq's nicely spread IRQ counts across all CPUs, but can hopefully 
enjoy some performance gains instead.

Anyway, here it is.  Applies to 2.5.31.  Comments and advice are very welcome:

diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
--- 2.5.31/arch/i386/kernel/acpi.c	Sat Aug 10 18:41:53 2002
+++ s31/arch/i386/kernel/acpi.c	Wed Aug 14 19:30:13 2002
@@ -114,6 +114,7 @@
 	unsigned long		size)
 {
 	struct acpi_table_madt	*madt = NULL;
+	extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);
 
 	if (!phys_addr || !size)
 		return -EINVAL;
@@ -130,6 +131,8 @@
 	printk(KERN_INFO PREFIX "Local APIC address 0x%08x\n",
 		madt->lapic_address);
 
+	acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+
 	return 0;
 }
 
@@ -301,6 +304,7 @@
 	char			*cmdline)
 {
 	int			result = 0;
+	extern void		smp_cluster_apic_check(void);
 
 	/*
 	 * The default interrupt routing model is PIC (8259).  This gets
@@ -416,8 +420,10 @@
 #endif /*CONFIG_X86_IO_APIC*/
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	if (acpi_lapic && acpi_ioapic)
+	if (acpi_lapic && acpi_ioapic) {
 		smp_found_config = 1;
+		smp_cluster_apic_check();
+	}
 #endif
 
 	return 0;
diff -ruN 2.5.31/arch/i386/kernel/apic.c s31/arch/i386/kernel/apic.c
--- 2.5.31/arch/i386/kernel/apic.c	Sat Aug 10 18:41:29 2002
+++ s31/arch/i386/kernel/apic.c	Wed Aug 14 19:30:13 2002
@@ -29,6 +29,7 @@
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
+#include <asm/smpboot.h>
 
 /* Using APIC to generate smp_local_timer_interrupt? */
 int using_apic_timer = 0;
@@ -272,6 +273,16 @@
 	apic_write_around(APIC_LVT1, value);
 }
 
+static inline unsigned long apic_ldr_value(unsigned long value)
+{
+	if (clustered_apic_numaq)
+		return (value);
+	if (clustered_apic_xapic)
+		return (((value) & ~APIC_LDR_MASK) |
+			SET_APIC_LOGICAL_ID(physical_to_logical_apicid(hard_smp_processor_id())));
+	return (((value) & ~APIC_LDR_MASK) | SET_APIC_LOGICAL_ID(1UL << 
smp_processor_id()));
+}
+
 void __init setup_local_APIC (void)
 {
 	unsigned long value, ver, maxlvt;
@@ -304,21 +315,22 @@
 	 * document number 292116).  So here it goes...
 	 */
 
-	if (!clustered_apic_mode) {
+	if (!clustered_apic_numaq) {
 		/*
-		 * In clustered apic mode, the firmware does this for us 
-		 * Put the APIC into flat delivery mode.
-		 * Must be "all ones" explicitly for 82489DX.
+		 * For NUMA-Q, the firmware does this for us.  Otherwise, put
+		 * the APIC into clustered or flat delivery mode.
+		 * Must be "all ones" explicitly for 82489DX.
 		 */
-		apic_write_around(APIC_DFR, 0xffffffff);
+		if (clustered_apic_mode)
+			apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+		else
+			apic_write_around(APIC_DFR, APIC_DFR_FLAT);
 
 		/*
 		 * Set up the logical destination ID.
 		 */
 		value = apic_read(APIC_LDR);
-		value &= ~APIC_LDR_MASK;
-		value |= (1<<(smp_processor_id()+24));
-		apic_write_around(APIC_LDR, value);
+		apic_write_around(APIC_LDR, apic_ldr_value(value));
 	}
 
 	/*
diff -ruN 2.5.31/arch/i386/kernel/io_apic.c s31/arch/i386/kernel/io_apic.c
--- 2.5.31/arch/i386/kernel/io_apic.c	Sat Aug 10 18:41:26 2002
+++ s31/arch/i386/kernel/io_apic.c	Wed Aug 14 19:30:13 2002
@@ -35,6 +35,7 @@
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
+#include <asm/smpboot.h>
 
 #undef APIC_LOCKUP_DEBUG
 
@@ -261,7 +262,7 @@
 		allowed_mask = cpu_online_map & irq_affinity[irq];
 		entry->timestamp = now;
 		entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
-		set_ioapic_affinity(irq, 1 << entry->cpu);
+		set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));
 	}
 }
 #else /* !SMP */
@@ -682,9 +683,40 @@
 	return current_vector;
 }
 
+/*
+ * round_robin_cpu_apic_id -- Since i386 Linux doesn't use the APIC TPRs to
+ * set task/interrupt priority, xAPICs' tiebreaker rule tends to hit one CPU
+ * with all interrupts for each quad.  Distribute the interrupts using a
+ * simple round robin scheme.
+ */
+static int round_robin_cpu_apic_id(void)
+{
+	int val;
+	static unsigned	next_cpu = 0;
+
+	if (next_cpu >= NR_CPUS || cpu_2_logical_apicid[next_cpu] == BAD_APICID)
+		next_cpu = 0;
+	val = cpu_present_to_apicid(next_cpu) | APIC_DEST_CPUS_MASK;
+	++next_cpu;
+	return (val);
+}
+
+static inline int target_cpus(void)
+{
+	if (clustered_apic_numaq)
+		return APIC_BROADCAST_ID_APIC;	/* broadcast to local quad */
+	if (clustered_apic_xapic)
+		return round_robin_cpu_apic_id();
+	return logical_cpu_present_map & 0xFFu;
+//	return cpu_online_map;
+}
+
 static struct hw_interrupt_type ioapic_level_irq_type;
 static struct hw_interrupt_type ioapic_edge_irq_type;
 
+#undef KERN_DEBUG
+#define KERN_DEBUG
+
 void __init setup_IO_APIC_irqs(void)
 {
 	struct IO_APIC_route_entry entry;
@@ -702,9 +734,9 @@
 		memset(&entry,0,sizeof(entry));
 
 		entry.delivery_mode = dest_LowestPrio;
-		entry.dest_mode = INT_DELIVERY_MODE;
+		entry.dest_mode = INT_DEST_ADDR_MODE;
 		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest = TARGET_CPUS;
+		entry.dest.logical.logical_dest = target_cpus();
 
 		idx = find_irq_entry(apic,pin,mp_INT);
 		if (idx == -1) {
@@ -722,7 +754,6 @@
 		if (irq_trigger(idx)) {
 			entry.trigger = 1;
 			entry.mask = 1;
-			entry.dest.logical.logical_dest = TARGET_CPUS;
 		}
 
 		irq = pin_2_irq(idx, apic, pin);
@@ -782,9 +813,9 @@
 	 * We use logical delivery to get the timer IRQ
 	 * to the first CPU.
 	 */
-	entry.dest_mode = INT_DELIVERY_MODE;
+	entry.dest_mode = INT_DEST_ADDR_MODE;
 	entry.mask = 0;					/* unmask IRQ now */
-	entry.dest.logical.logical_dest = TARGET_CPUS;
+	entry.dest.logical.logical_dest = target_cpus();
 	entry.delivery_mode = dest_LowestPrio;
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -1141,7 +1172,7 @@
 		
 		old_id = mp_ioapics[apic].mpc_apicid;
 
-		if (mp_ioapics[apic].mpc_apicid >= 0xf) {
+		if (mp_ioapics[apic].mpc_apicid >= apic_broadcast_id) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
 				apic, mp_ioapics[apic].mpc_apicid);
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
@@ -1153,14 +1184,16 @@
 		 * Sanity check, is the ID really free? Every APIC in a
 		 * system must have a unique ID or we get lots of nice
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
+		 * I/O APIC IDs no longer have any meaning for xAPICs.
 		 */
-		if (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid)) {
+		if (!clustered_apic_xapic &&
+		    (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid))) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
 				apic, mp_ioapics[apic].mpc_apicid);
 			for (i = 0; i < 0xf; i++)
 				if (!(phys_id_present_map & (1 << i)))
 					break;
-			if (i >= 0xf)
+			if (i >= apic_broadcast_id)
 				panic("Max APIC ID exceeded!\n");
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
@@ -1288,7 +1321,7 @@
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
-	balance_irq(irq);
+//	balance_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1328,7 +1361,7 @@
 	unsigned long v;
 	int i;
 
-	balance_irq(irq);
+//	balance_irq(irq);
 /*
  * It appears there is an erratum which affects at least version 0x11
  * of I/O APIC (that's the 82093AA and cores integrated into various
@@ -1849,8 +1882,8 @@
 	memset(&entry,0,sizeof(entry));
 
 	entry.delivery_mode = dest_LowestPrio;
-	entry.dest_mode = INT_DELIVERY_MODE;
-	entry.dest.logical.logical_dest = TARGET_CPUS;
+	entry.dest_mode = INT_DEST_ADDR_MODE;
+	entry.dest.logical.logical_dest = target_cpus();
 	entry.mask = 1;					 /* Disabled (masked) */
 	entry.trigger = 1;				   /* Level sensitive */
 	entry.polarity = 1;					/* Low active */
diff -ruN 2.5.31/arch/i386/kernel/irq.c s31/arch/i386/kernel/irq.c
--- 2.5.31/arch/i386/kernel/irq.c	Sat Aug 10 18:41:19 2002
+++ s31/arch/i386/kernel/irq.c	Thu Aug 22 17:48:15 2002
@@ -332,6 +332,7 @@
 
 	irq_enter();
 	kstat.irqs[cpu][irq]++;
+	apic_adj_tpr(TPR_IRQ);
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
@@ -389,6 +390,7 @@
 	 */
 	desc->handler->end(irq);
 	spin_unlock(&desc->lock);
+	apic_adj_tpr(-TPR_IRQ);
 
 	irq_exit();
 
diff -ruN 2.5.31/arch/i386/kernel/mpparse.c s31/arch/i386/kernel/mpparse.c
--- 2.5.31/arch/i386/kernel/mpparse.c	Sat Aug 10 18:41:25 2002
+++ s31/arch/i386/kernel/mpparse.c	Wed Aug 14 19:30:13 2002
@@ -30,6 +30,7 @@
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
 #include <asm/io_apic.h>
+#include <asm/smpboot.h>
 
 /* Have we found an MP table */
 int smp_found_config;
@@ -68,6 +69,13 @@
 
 /* Bitmask of physically existing CPUs */
 unsigned long phys_cpu_present_map;
+unsigned long logical_cpu_present_map;
+
+u32 apic_broadcast_id = APIC_BROADCAST_ID_APIC;
+u8 clustered_apic_mode = 0;
+u8 esr_disable = 0;
+u8 raw_phys_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+static u8 clustered_hint = 0;
 
 /*
  * Intel MP BIOS table parsing routines:
@@ -104,8 +112,8 @@
 	if (!(m->mpc_cpuflag & CPU_ENABLED))
 		return;
 
-	logical_apicid = m->mpc_apicid;
-	if (clustered_apic_mode) {
+	logical_apicid = 0x01;
+	if (clustered_apic_numaq) {
 		quad = translation_table[mpc_record]->trans_quad;
 		logical_apicid = (quad << 4) + 
 			(m->mpc_apicid ? m->mpc_apicid << 1 : 1);
@@ -186,11 +194,8 @@
 	}
 	ver = m->mpc_apicver;
 
-	if (clustered_apic_mode) {
-		phys_cpu_present_map |= (logical_apicid&0xf) << (4*quad);
-	} else {
-		phys_cpu_present_map |= 1 << m->mpc_apicid;
-	}
+	logical_cpu_present_map |= 1 << (num_processors-1);
+	phys_cpu_present_map |= apicid_to_phys_cpu_present(m->mpc_apicid);
 	/*
 	 * Validate version
 	 */
@@ -199,6 +204,7 @@
 		ver = 0x10;
 	}
 	apic_version[m->mpc_apicid] = ver;
+	raw_phys_apicid[num_processors - 1] = m->mpc_apicid;
 }
 
 static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -209,7 +215,7 @@
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 	
-	if (clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		quad = translation_table[mpc_record]->trans_quad;
 		mp_bus_id_to_node[m->mpc_busid] = quad;
 		mp_bus_id_to_local[m->mpc_busid] = 
translation_table[mpc_record]->trans_local;
@@ -253,6 +259,15 @@
 	}
 	mp_ioapics[nr_ioapics] = *m;
 	nr_ioapics++;
+	/******
+	 * Warning!  We have an APIC version number collision between the APICs
+	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
+	 * define the xAPIC ver ID range until late in the development cycle,
+	 * so there is working silicon out there that doesn't match it.
+	 * A test in smp_cluster_apic_check() resolves the above conflict.
+	 ******/
+	if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
 }
 
 static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
@@ -348,12 +363,39 @@
 }
 
 /*
+ * Common code for MPS and ACPI/MADT.
+ */
+void __init smp_cluster_apic_check(void)
+{
+	int i;
+	u8 cluster;
+	static const char *mode_names[] = {
+		"Flat", "Clustered NUMA-Q", "Clustered xAPIC", "???"
+	};
+
+	if (clustered_hint) {
+		if (clustered_hint & CLUSTERED_APIC_NUMAQ) {
+			/* NUMA-Q boxes never had xAPICs */
+			clustered_hint &= ~CLUSTERED_APIC_XAPIC;
+		}
+		clustered_apic_mode = clustered_hint;
+		esr_disable = 1;
+		if (clustered_apic_xapic)
+			apic_broadcast_id = APIC_BROADCAST_ID_XAPIC;
+		phys_cpu_present_map = logical_cpu_present_map;
+	}
+	printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
+		mode_names[clustered_apic_mode], nr_ioapics);
+}
+
+/*
  * Read/parse the MPC
  */
 
 static int __init smp_read_mpc(struct mp_config_table *mpc)
 {
-	char str[16];
+	char oem[10];
+	char prod[14];
 	int count=sizeof(*mpc);
 	unsigned char *mpt=((unsigned char *)mpc)+count;
 
@@ -378,13 +440,21 @@
 		printk(KERN_ERR "SMP mptable: null local APIC address!\n");
 		return 0;
 	}
-	memcpy(str,mpc->mpc_oem,8);
-	str[8]=0;
-	printk("OEM ID: %s ",str);
-
-	memcpy(str,mpc->mpc_productid,12);
-	str[12]=0;
-	printk("Product ID: %s ",str);
+	memcpy(oem, mpc->mpc_oem, 8);
+	oem[8] = 0;
+	memcpy(prod, mpc->mpc_productid, 12);
+	prod[12] = 0;
+	printk("OEM ID: %s ", oem);
+	printk("Product ID: %s ",prod);
+	/*
+	 * Can't recognize Summit xAPICs (see MP_ioapic_info), so use
+	 * OEM/Product IDs.
+	 */
+	if (!strncmp(oem, "IBM ENSW", 8) &&
+	    (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+	else if (!strncmp(oem, "IBM NUMA", 8))
+		clustered_hint |= CLUSTERED_APIC_NUMAQ;
 
 	printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
 
@@ -395,7 +465,7 @@
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
 
-	if (clustered_apic_mode && mpc->mpc_oemptr) {
+	if (clustered_apic_numaq && mpc->mpc_oemptr) {
 		/* We need to process the oem mpc tables to tell us which quad things are in ... */
 		mpc_record = 0;
 		smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, 
mpc->mpc_oemsize);
@@ -463,6 +533,7 @@
 		}
 		++mpc_record;
 	}
+	smp_cluster_apic_check();
 	if (!num_processors)
 		printk(KERN_ERR "SMP mptable: no processors registered!\n");
 	return num_processors;
@@ -934,6 +1005,17 @@
 		mp_ioapic_routing[idx].irq_start,
 		mp_ioapic_routing[idx].irq_end);
 
+	/******
+	 * Warning!  We have an APIC version number collision between the APICs
+	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
+	 * define the xAPIC ver ID range until late in the development cycle,
+	 * so there is working silicon out there that doesn't match it.
+	 * A test in smp_cluster_apic_check() resolves the above conflict.
+	 ******/
+	if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
+	    mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+
 	return;
 }
 
@@ -1051,6 +1133,13 @@
 	return;
 }
 
+/* Hook from generic ACPI tables.c */
+void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8))
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+}
+
 #ifdef CONFIG_ACPI_PCI
 
 void __init mp_parse_prt (void)
diff -ruN 2.5.31/arch/i386/kernel/process.c s31/arch/i386/kernel/process.c
--- 2.5.31/arch/i386/kernel/process.c	Sat Aug 10 18:41:15 2002
+++ s31/arch/i386/kernel/process.c	Wed Aug 14 19:30:13 2002
@@ -145,7 +145,9 @@
 		irq_stat[smp_processor_id()].idle_timestamp = jiffies;
 		while (!need_resched())
 			idle();
+		apic_set_tpr(TPR_TASK);
 		schedule();
+		apic_set_tpr(TPR_IDLE);
 	}
 }
 
@@ -197,7 +199,7 @@
 			}
 				/* we will leave sorting out the final value 
 				when we are ready to reboot, since we might not
- 				have set up boot_cpu_id or smp_num_cpu */
+ 				have set up boot_cpu_physical_apicid or smp_num_cpu */
 			break;
 #endif
 		}
diff -ruN 2.5.31/arch/i386/kernel/smpboot.c s31/arch/i386/kernel/smpboot.c
--- 2.5.31/arch/i386/kernel/smpboot.c	Sat Aug 10 18:41:28 2002
+++ s31/arch/i386/kernel/smpboot.c	Wed Aug 14 19:30:13 2002
@@ -498,59 +498,23 @@
 	return do_fork(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0);
 }
 
-/* which physical APIC ID maps to which logical CPU number */
-volatile int physical_apicid_2_cpu[MAX_APICID];
 /* which logical CPU number maps to which physical APIC ID */
-volatile int cpu_2_physical_apicid[NR_CPUS];
+volatile u8 cpu_2_physical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID 
};
 
-/* which logical APIC ID maps to which logical CPU number */
-volatile int logical_apicid_2_cpu[MAX_APICID];
 /* which logical CPU number maps to which logical APIC ID */
-volatile int cpu_2_logical_apicid[NR_CPUS];
+volatile u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID 
};
 
-static inline void init_cpu_to_apicid(void)
-/* Initialize all maps between cpu number and apicids */
-{
-	int apicid, cpu;
-
-	for (apicid = 0; apicid < MAX_APICID; apicid++) {
-		physical_apicid_2_cpu[apicid] = -1;
-		logical_apicid_2_cpu[apicid] = -1;
-	}
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		cpu_2_physical_apicid[cpu] = -1;
-		cpu_2_logical_apicid[cpu] = -1;
-	}
-}
 
-static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * set up a mapping between cpu and apicid. Uses logical apicids for 
multiquad,
- * else physical apic ids
- */
+static inline void map_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8 
log_apicid)
 {
-	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_logical_apicid[cpu] = apicid;
-	} else {
-		physical_apicid_2_cpu[apicid] = cpu;	
-		cpu_2_physical_apicid[cpu] = apicid;
-	}
+	cpu_2_logical_apicid[cpu] = log_apicid;
+	cpu_2_physical_apicid[cpu] = phys_apicid;
 }
 
-static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
- */
+static inline void unmap_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8 
log_apicid)
 {
-	if (clustered_apic_mode) {
-		logical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_logical_apicid[cpu] = -1;
-	} else {
-		physical_apicid_2_cpu[apicid] = -1;	
-		cpu_2_physical_apicid[cpu] = -1;
-	}
+	cpu_2_logical_apicid[cpu] = BAD_APICID;
+	cpu_2_physical_apicid[cpu] = BAD_APICID;
 }
 
 #if APIC_DEBUG
@@ -764,7 +728,7 @@
 
 extern unsigned long cpu_initialized;
 
-static void __init do_boot_cpu (int apicid) 
+static void __init do_boot_cpu(u8 phys_apicid, u8 log_apicid)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -774,7 +738,7 @@
 	unsigned long boot_error = 0;
 	int timeout, cpu;
 	unsigned long start_eip;
-	unsigned short nmi_high, nmi_low;
+	unsigned short nmi_high = 0, nmi_low = 0;
 
 	cpu = ++cpucount;
 	/*
@@ -791,7 +755,7 @@
 	 */
 	init_idle(idle, cpu);
 
-	map_cpu_to_boot_apicid(cpu, apicid);
+	map_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 
@@ -801,7 +765,8 @@
 	start_eip = setup_trampoline();
 
 	/* So we see what's up   */
-	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+	printk("Booting processor %d/0x%02X/0x%02X eip 0x%lX\n",
+		cpu, phys_apicid, log_apicid, start_eip);
 	stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle->thread_info);
 
 	/*
@@ -813,7 +778,7 @@
 
 	Dprintk("Setting warm reset code and vector.\n");
 
-	if (clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		/* stash the current NMI vector, so we can put things back */
 		nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
 		nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
@@ -830,7 +795,7 @@
 	/*
 	 * Be paranoid about clearing APIC errors.
 	 */
-	if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[apicid])) {
+	if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[phys_apicid])) {
 		apic_read_around(APIC_SPIV);
 		apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
@@ -845,10 +810,10 @@
 	 * Starting actual IPI sequence...
 	 */
 
-	if (clustered_apic_mode)
-		boot_error = wakeup_secondary_via_NMI(apicid);
-	else 
-		boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
+	if (clustered_apic_numaq)
+		boot_error = wakeup_secondary_via_NMI(log_apicid);
+	else
+		boot_error = wakeup_secondary_via_INIT(phys_apicid, start_eip);
 
 	if (!boot_error) {
 		/*
@@ -883,14 +848,15 @@
 				/* trampoline code not run */
 				printk("Not responding.\n");
 #if APIC_DEBUG
+			/* xAPICs don't do remote inquiries. */
 			if (!clustered_apic_mode)
-				inquire_remote_apic(apicid);
+				inquire_remote_apic(phys_apicid);
 #endif
 		}
 	}
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
-		unmap_cpu_to_boot_apicid(cpu, apicid);
+		unmap_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
 		clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
 		clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
 		cpucount--;
@@ -899,7 +865,7 @@
 	/* mark "stuck" area as not stuck */
 	*((volatile unsigned long *)phys_to_virt(8192)) = 0;
 
-	if(clustered_apic_mode) {
+	if (clustered_apic_numaq) {
 		printk("Restoring NMI vector\n");
 		*((volatile unsigned short *) TRAMPOLINE_HIGH) = nmi_high;
 		*((volatile unsigned short *) TRAMPOLINE_LOW) = nmi_low;
@@ -958,7 +924,6 @@
 extern int prof_old_multiplier[NR_CPUS];
 extern int prof_counter[NR_CPUS];
 
-static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
@@ -966,9 +931,11 @@
 
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
-	int apicid, cpu, bit;
+	int cpu, bit;
+	u8 phys_apicid, log_apicid;
 
-        if (clustered_apic_mode && (numnodes > 1)) {
+#ifdef CONFIG_MULTIQUAD
+        if (clustered_apic_numaq && (numnodes > 1)) {
                 printk("Remapping cross-quad port I/O for %d quads\n",
 			numnodes);
                 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
@@ -977,6 +944,7 @@
                 xquad_portio = ioremap (XQUAD_PORTIO_BASE, 
 			numnodes * XQUAD_PORTIO_LEN);
         }
+#endif
 
 #ifdef CONFIG_MTRR
 	/*  Must be done before other processors booted  */
@@ -993,8 +961,6 @@
 		prof_multiplier[cpu] = 1;
 	}
 
-	init_cpu_to_apicid();
-
 	/*
 	 * Setup boot CPU information
 	 */
@@ -1007,8 +973,14 @@
 	 */
 	set_bit(0, &cpu_online_map);
 	set_bit(0, &cpu_callout_map);
-	boot_cpu_logical_apicid = logical_smp_processor_id();
-	map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+	if (clustered_apic_xapic)
+		boot_cpu_logical_apicid = 
physical_to_logical_apicid(boot_cpu_physical_apicid);
+	else if (clustered_apic_numaq)
+		boot_cpu_logical_apicid = logical_smp_processor_id();
+	else
+		boot_cpu_logical_apicid = 0x01;
+	map_cpu_to_boot_apicid(0, boot_cpu_physical_apicid, 
boot_cpu_logical_apicid);
+printk("Boot CPU #0/0x%02X/0x%02X\n", boot_cpu_physical_apicid, 
boot_cpu_logical_apicid);
 
 	current_thread_info()->cpu = 0;
 	smp_tune_scheduling();
@@ -1085,28 +1057,44 @@
 	 */
 	Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
 
-	for (bit = 0; bit < NR_CPUS; bit++) {
-		apicid = cpu_present_to_apicid(bit);
+	for (cpu = 1, bit = 0; bit < NR_CPUS; bit++) {
+		if (!(logical_cpu_present_map & (1ul << bit)))
+			continue;
+		if ((max_cpus >= 0) && (max_cpus <= cpucount + 1))
+			continue;
+		phys_apicid = raw_phys_apicid[bit];
 		/*
 		 * Don't even attempt to start the boot CPU!
 		 */
-		if (apicid == boot_cpu_apicid)
+		if (phys_apicid == boot_cpu_physical_apicid)
 			continue;
-
-		if (!(phys_cpu_present_map & (1 << bit)))
-			continue;
-		if (max_cpus <= cpucount+1)
+		if (phys_apicid == BAD_APICID)
 			continue;
+		if (clustered_apic_xapic)
+			log_apicid = (u8)physical_to_logical_apicid(phys_apicid);
+		else if (clustered_apic_numaq)
+			log_apicid = ((bit >> 2) << 4) | (1 << (bit & 0x3));
+		else {
+			/* Yes, this overflows if cpu > 7.  The APIC
+			 * destination register is only 8 bits wide.
+			 * For more than 8 CPUs, must use clustered mode. */
+			log_apicid = 1u << cpu;
+			if (log_apicid == 0)
+				BUG();
+		}
 
-		do_boot_cpu(apicid);
+		do_boot_cpu(phys_apicid, log_apicid);
 
 		/*
 		 * Make sure we unmap all failed CPUs
 		 */
-		if ((boot_apicid_to_cpu(apicid) == -1) &&
-				(phys_cpu_present_map & (1 << bit)))
-			printk("CPU #%d not responding - cannot use it.\n",
-								apicid);
+		if ((cpu_2_physical_apicid[cpu] == BAD_APICID) &&
+				(logical_cpu_present_map & (1ul << bit))) {
+			printk("CPU #%d/0x%02X/0x%02X not responding - cannot use it.\n",
+					bit, phys_apicid, log_apicid);
+			logical_cpu_present_map &= ~(1ul << bit);
+		} else
+			++cpu;		/* Got a live one. */
 	}
 
 	/*
diff -ruN 2.5.31/arch/i386/kernel/trampoline.S 
s31/arch/i386/kernel/trampoline.S
--- 2.5.31/arch/i386/kernel/trampoline.S	Sat Aug 10 18:41:27 2002
+++ s31/arch/i386/kernel/trampoline.S	Wed Aug 14 19:30:13 2002
@@ -36,9 +36,7 @@
 
 ENTRY(trampoline_data)
 r_base = .
-#ifdef CONFIG_MULTIQUAD
 	wbinvd
-#endif /* CONFIG_MULTIQUAD */
 	mov	%cs, %ax	# Code and data in the same place
 	mov	%ax, %ds
 
diff -ruN 2.5.31/include/asm-i386/apic.h s31/include/asm-i386/apic.h
--- 2.5.31/include/asm-i386/apic.h	Sat Aug 10 18:42:05 2002
+++ s31/include/asm-i386/apic.h	Wed Aug 14 19:31:11 2002
@@ -64,6 +64,22 @@
 	apic_write_around(APIC_EOI, 0);
 }
 
+static inline void apic_set_tpr(unsigned long val)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, (value & ~APIC_TPRI_MASK) + val);
+}
+
+static inline void apic_adj_tpr(long adj)
+{
+	unsigned long value;
+
+	value = apic_read(APIC_TASKPRI);
+	apic_write_around(APIC_TASKPRI, value + adj);
+}
+
 extern int get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC (void);
@@ -96,6 +112,15 @@
 #define NMI_LOCAL_APIC	2
 #define NMI_INVALID	3
 
+#else /* CONFIG_X86_LOCAL_APIC */
+#define apic_set_tpr(val)
+#define apic_adj_tpr(adj)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
+/* Priority values for apic_adj_tpr() and apic_set_tpr() */
+/* xAPICs only do priority comparisons on the upper nibble. */
+#define TPR_IDLE	(0x00L)
+#define TPR_TASK	(0x10L)
+#define TPR_IRQ		(0x10L)
+
 #endif /* __ASM_APIC_H */
diff -ruN 2.5.31/include/asm-i386/apicdef.h s31/include/asm-i386/apicdef.h
--- 2.5.31/include/asm-i386/apicdef.h	Sat Aug 10 18:41:36 2002
+++ s31/include/asm-i386/apicdef.h	Wed Aug 14 19:30:13 2002
@@ -11,8 +11,10 @@
 #define		APIC_DEFAULT_PHYS_BASE	0xfee00000
  
 #define		APIC_ID		0x20
-#define			APIC_ID_MASK		(0x0F<<24)
-#define			GET_APIC_ID(x)		(((x)>>24)&0x0F)
+#define			APIC_ID_MASK		(0xFF<<24)
+#define			GET_APIC_ID(x)		(((x)>>24)&0xFF)
+#define				XAPIC_VER_LOW	0x14	/* Version num range */
+#define				XAPIC_VER_HIGH	0x1F
 #define		APIC_LVR	0x30
 #define			APIC_LVR_MASK		0xFF00FF
 #define			GET_APIC_VERSION(x)	((x)&0xFF)
@@ -32,6 +34,8 @@
 #define			SET_APIC_LOGICAL_ID(x)	(((x)<<24))
 #define			APIC_ALL_CPUS		0xFF
 #define		APIC_DFR	0xE0
+#define			APIC_DFR_CLUSTER	0x0FFFFFFFul	/* Clustered */
+#define			APIC_DFR_FLAT		0xFFFFFFFFul	/* Flat mode */
 #define		APIC_SPIV	0xF0
 #define			APIC_SPIV_FOCUS_DISABLED	(1<<9)
 #define			APIC_SPIV_APIC_ENABLED		(1<<8)
@@ -58,6 +62,7 @@
 #define			APIC_INT_ASSERT		0x04000
 #define			APIC_ICR_BUSY		0x01000
 #define			APIC_DEST_LOGICAL	0x00800
+#define				APIC_DEST_PHYSICAL	0x0	/* For symmetry */
 #define			APIC_DM_FIXED		0x00000
 #define			APIC_DM_LOWEST		0x00100
 #define			APIC_DM_SMI		0x00200
@@ -108,7 +113,13 @@
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 
-#define MAX_IO_APICS 8
+#define MAX_IO_APICS 32	/* Summit boxes can have 4*(2+3*2) I/O APICs */
+
+/*
+ * The intr broadcast ID is 0xF for old APICs and 0xFF for xAPICs.
+ */
+#define APIC_BROADCAST_ID_XAPIC	0xFF
+#define APIC_BROADCAST_ID_APIC	0x0F
 
 /*
  * the local APIC register structure, memory mapped. Not terribly well
diff -ruN 2.5.31/include/asm-i386/mpspec.h s31/include/asm-i386/mpspec.h
--- 2.5.31/include/asm-i386/mpspec.h	Sat Aug 10 18:41:16 2002
+++ s31/include/asm-i386/mpspec.h	Wed Aug 14 19:30:13 2002
@@ -14,13 +14,10 @@
 #define SMP_MAGIC_IDENT	(('_'<<24)|('P'<<16)|('M'<<8)|'_')
 
 /*
- * a maximum of 16 APICs with the current APIC ID architecture.
+ * A maximum of 16 APICs with the classic APIC ID architecture.
+ * xAPICs can have up to 256.
  */
-#ifdef CONFIG_MULTIQUAD
 #define MAX_APICS 256
-#else /* !CONFIG_MULTIQUAD */
-#define MAX_APICS 16
-#endif /* CONFIG_MULTIQUAD */
 
 #define MAX_MPC_ENTRY 1024
 
@@ -204,6 +201,7 @@
 extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
 
 extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int boot_cpu_logical_apicid;
 extern unsigned long phys_cpu_present_map;
 extern int smp_found_config;
 extern void find_smp_config (void);
diff -ruN 2.5.31/include/asm-i386/smp.h s31/include/asm-i386/smp.h
--- 2.5.31/include/asm-i386/smp.h	Sat Aug 10 18:41:18 2002
+++ s31/include/asm-i386/smp.h	Wed Aug 14 19:30:13 2002
@@ -19,33 +19,56 @@
 #include <asm/io_apic.h>
 #endif
 #include <asm/apic.h>
-#endif
-#endif
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_X86_LOCAL_APIC */
 
-#ifdef CONFIG_SMP
-# ifdef CONFIG_MULTIQUAD
-#  define TARGET_CPUS 0xf     /* all CPUs in *THIS* quad */
-#  define INT_DELIVERY_MODE 0     /* physical delivery on LOCAL quad */
-# else
-#  define TARGET_CPUS cpu_online_map
-#  define INT_DELIVERY_MODE 1     /* logical delivery broadcast to all procs 
*/
-# endif
-#else
-# define INT_DELIVERY_MODE 1     /* logical delivery */
-# define TARGET_CPUS 0x01
-#endif
+#ifndef __ASSEMBLY__
+extern u8 clustered_apic_mode;
+extern u8 esr_disable;
+extern u32 apic_broadcast_id;
+extern unsigned long logical_cpu_present_map;
+extern unsigned long phys_cpu_present_map;
+
+/*
+ * Some lowlevel functions might want to know about
+ * the real APIC ID <-> CPU # mapping.
+ */
+#define MAX_APICID 256
+#define BAD_APICID 0xFFu
+extern volatile u8 cpu_2_physical_apicid[NR_CPUS];
+extern volatile u8 physical_apicid_2_cpu[MAX_APICID];
+extern volatile u8 cpu_2_logical_apicid[NR_CPUS];
+extern volatile u8 logical_apicid_2_cpu[MAX_APICID];
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+
+#ifndef CONFIG_X86_LOCAL_APIC
+
+#define clustered_apic_mode	(0)
+#define esr_disable		(0)
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* !__ASSEMBLY__ */
+
+#define CLUSTERED_APIC_NUMAQ	0x01
+#define CLUSTERED_APIC_XAPIC	0x02
+
+#define clustered_apic_numaq	(clustered_apic_mode & CLUSTERED_APIC_NUMAQ)
+#define clustered_apic_xapic	(clustered_apic_mode & CLUSTERED_APIC_XAPIC)
+
+#define APIC_DEST_CPUS_MASK	0x0Fu	/* Destination masks for */
+#define APIC_DEST_CLUSTER_MASK	0xF0u	/* clustered mode. */
+#define INT_DEST_ADDR_MODE	1	/* logical delivery */
 
-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
-  #define clustered_apic_mode (1)
-  #define esr_disable (1)
- #else /* !CONFIG_MULTIQUAD */
-  #define clustered_apic_mode (0)
-  #define esr_disable (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif 
 
 #ifdef CONFIG_SMP
+#define smp_processor_id() (current->processor)
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -53,7 +76,6 @@
  */
  
 extern void smp_alloc_memory(void);
-extern unsigned long phys_cpu_present_map;
 extern unsigned long cpu_online_map;
 extern volatile unsigned long smp_invalidate_needed;
 extern int pic_mode;
@@ -69,16 +91,6 @@
 extern void zap_low_mappings (void);
 
 /*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
- */
-#define MAX_APICID 256
-extern volatile int cpu_to_physical_apicid[NR_CPUS];
-extern volatile int physical_apicid_to_cpu[MAX_APICID];
-extern volatile int cpu_to_logical_apicid[NR_CPUS];
-extern volatile int logical_apicid_to_cpu[MAX_APICID];
-
-/*
  * This function is needed by all SMP systems. It must _always_ be valid
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
@@ -123,7 +135,7 @@
 
 #endif /* !__ASSEMBLY__ */
 
-#define NO_PROC_ID		0xFF		/* No processor magic marker */
+#define NO_PROC_ID		0xFFu		/* No processor magic marker */
 
-#endif
-#endif
+#endif /* CONFIG_SMP */
+#endif /* __ASM_SMP_H */
diff -ruN 2.5.31/include/asm-i386/smpboot.h s31/include/asm-i386/smpboot.h
--- 2.5.31/include/asm-i386/smpboot.h	Sat Aug 10 18:41:55 2002
+++ s31/include/asm-i386/smpboot.h	Wed Aug 14 19:30:13 2002
@@ -1,62 +1,50 @@
 #ifndef __ASM_SMPBOOT_H
 #define __ASM_SMPBOOT_H
 
-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
-  #define clustered_apic_mode (1)
- #else /* !CONFIG_MULTIQUAD */
-  #define clustered_apic_mode (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif 
- 
-#ifdef CONFIG_MULTIQUAD
- #define TRAMPOLINE_LOW phys_to_virt(0x8)
- #define TRAMPOLINE_HIGH phys_to_virt(0xa)
-#else /* !CONFIG_MULTIQUAD */
- #define TRAMPOLINE_LOW phys_to_virt(0x467)
- #define TRAMPOLINE_HIGH phys_to_virt(0x469)
-#endif /* CONFIG_MULTIQUAD */
-
-#ifdef CONFIG_MULTIQUAD
- #define boot_cpu_apicid boot_cpu_logical_apicid
-#else /* !CONFIG_MULTIQUAD */
- #define boot_cpu_apicid boot_cpu_physical_apicid
-#endif /* CONFIG_MULTIQUAD */
+#ifndef __ASM_SMP_H
+#include "asm/smp.h"
+#endif
+
+#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
+#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)
+
+//#define boot_cpu_apicid (clustered_apic_numaq?boot_cpu_logical_apicid:boot_cpu_physical_apicid)
+
+/*
+ * To build the logical APIC ID for each CPU we have three cases:
+ *  1) Normal flat mode:  use a bitmap of the CPU numbers
+ *  2) NUMA-Q:  do nothing, the BIOS has set it up
+ *  3) xAPIC:  convert the Intel standard physical APIC ID to a cluster
+ *	nibble/cpu bitmap nibble
+ */
+/* cpu index numbr:  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, ... */
+/* phys xAPIC IDs : 00, 01, 02, 03, 10, 11, 12, 13, 20, 21, 22, ... */
+/* logical APIC ID: 01, 02, 04, 08, 11, 12, 14, 18, 21, 22, 24, ... */
+#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) | ((phys_apic) & APIC_DEST_CLUSTER_MASK))
 
 /*
- * How to map from the cpu_present_map
+ * How to map from phys_cpu_present_map.
+ *  1) Normal flat mode:  use the mps_cpu, apicid bitmap
+ *  2) Multi-Quad:  only 4 CPUs per cluster, cluster ID in high nibble
  */
-#ifdef CONFIG_MULTIQUAD
- #define cpu_present_to_apicid(mps_cpu) ( ((mps_cpu/4)*16) + (1<<(mps_cpu%4)) )
-#else /* !CONFIG_MULTIQUAD */
- #define cpu_present_to_apicid(apicid) (apicid)
-#endif /* CONFIG_MULTIQUAD */
+#if 1
+#define cpu_present_to_apicid(cpu)	(cpu_to_logical_apicid(cpu))
+#else
+#define cpu_present_to_apicid(mps_cpu)	(clustered_apic_numaq ? \
+ 	( (((u32)(mps_cpu) >> 2) << 4) + (1u << ((mps_cpu) & 0x3)) ) : \
+	 (clustered_apic_xapic ? cpu_to_logical_apicid(mps_cpu) : 1u << (mps_cpu) ) )
+#endif
+extern unsigned char raw_phys_apicid[NR_CPUS];
+#define apicid_to_phys_cpu_present(apicid)	(clustered_apic_mode ? (1ul << ((((apicid) >> 4) << 2) | ((apicid) & 0x3))) : (1ul << (apicid)))
 
 /*
  * Mappings between logical cpu number and logical / physical apicid
- * The first four macros are trivial, but it keeps the abstraction consistent
  */
-extern volatile int logical_apicid_2_cpu[];
-extern volatile int cpu_2_logical_apicid[];
-extern volatile int physical_apicid_2_cpu[];
-extern volatile int cpu_2_physical_apicid[];
-
-#define logical_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_logical_apicid(cpu) cpu_2_logical_apicid[cpu]
-#define physical_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_physical_apicid(cpu) cpu_2_physical_apicid[cpu]
-#ifdef CONFIG_MULTIQUAD			/* use logical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_logical_apicid[cpu]
-#else /* !CONFIG_MULTIQUAD */		/* use physical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_physical_apicid[cpu]
-#endif /* CONFIG_MULTIQUAD */
-
-
-#ifdef CONFIG_MULTIQUAD
-#else /* !CONFIG_MULTIQUAD */
-#endif /* CONFIG_MULTIQUAD */
+extern volatile u8 cpu_2_logical_apicid[];
+extern volatile u8 cpu_2_physical_apicid[];
+
+#define cpu_to_logical_apicid(cpu)	(int)cpu_2_logical_apicid[cpu]
+#define cpu_to_physical_apicid(cpu)	(int)cpu_2_physical_apicid[cpu]
 
 
 #endif


-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
       [not found]     ` <200208221931.35052.jamesclv@us.ibm.com.suse.lists.linux.kernel>
@ 2002-08-23  7:11       ` Andi Kleen
  2002-08-23  8:48         ` William Lee Irwin III
                           ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Andi Kleen @ 2002-08-23  7:11 UTC (permalink / raw)
  To: James Cleverdon; +Cc: linux-kernel

James Cleverdon <jamesclv@us.ibm.com> writes:


Some review.

> diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
> --- 2.5.31/arch/i386/kernel/acpi.c	Sat Aug 10 18:41:53 2002
> +++ s31/arch/i386/kernel/acpi.c	Wed Aug 14 19:30:13 2002
> @@ -114,6 +114,7 @@
>  	unsigned long		size)
>  {
>  	struct acpi_table_madt	*madt = NULL;
> +	extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);

This should be moved to acpi.h

>  {
>  	int			result = 0;
> +	extern void		smp_cluster_apic_check(void);

And smp.h

> -		set_ioapic_affinity(irq, 1 << entry->cpu);
> +		set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));

and cpu_present_to_apicid()

> +#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) | 
> ((phys_apic) & APIC_DEST_CLUSTER_MASK))

which is not equivalent for more than four CPUs and not using 
clustered mode. Are you sure this is correct? One of these must be wrong 
then, either the old or the new code.


> + * with all interrupts for each quad.  Distribute the interrupts using a
> + * simple round robin scheme.
> + */
> +static int round_robin_cpu_apic_id(void)
> +{
> +	int val;
> +	static unsigned	next_cpu = 0;

This is not protected by any global lock. Are you sure this is ok ?

> @@ -1288,7 +1321,7 @@
>   */
>  static void ack_edge_ioapic_irq(unsigned int irq)
>  {
> -	balance_irq(irq);
> +//	balance_irq(irq);

I would get rid of it completely. 

Doing the TPR change is certainly very involved - testing that on 
a lot of different SMP machines will be definitely needed. I think
it is the right way to go I agree, balance_irq always looked fishy to
me, especially with HyperThreading. How even is the distribution of the 
interrupts under load? Did you test it with Intel chipset P4s ?
Is this mode implemented on all APICs ?
Do you have any thoughts on this scheme on how this interacts with
HyperThreading ?

> @@ -332,6 +332,7 @@
>  
>  	irq_enter();
>  	kstat.irqs[cpu][irq]++;
> +	apic_adj_tpr(TPR_IRQ);
>  	spin_lock(&desc->lock);
>  	desc->handler->ack(irq);
>  	/*
> @@ -389,6 +390,7 @@
>  	 */
>  	desc->handler->end(irq);
>  	spin_unlock(&desc->lock);
> +	apic_adj_tpr(-TPR_IRQ);

It may make sense to keep it raised over softirqs as well.
This is a bit tricky because they are called from the entry.S 
assembly. It may make sense to raise it again using some asm/ defined
macros in kernel/softirq.c. If not a CPU mostly processing softirqs 
will be marked idle in the idle loop, which is not good.


> translation_table[mpc_record]->trans_local;
> @@ -253,6 +259,15 @@
>  	}
>  	mp_ioapics[nr_ioapics] = *m;
>  	nr_ioapics++;
> +	/******
> +	 * Warning!  We have an APIC version number collision between the APICs
> +	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
> +	 * define the xAPIC ver ID range until late in the development cycle,
> +	 * so there is working silicon out there that doesn't match it.
> +	 * A test in smp_cluster_apic_check() resolves the above conflict.
> +	 ******/
> +	if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
> +		clustered_hint |= CLUSTERED_APIC_XAPIC;
>  }

This looks risky in the general case. Can't you wrap it with some special
check to make sure it only ever triggers on your hardware?

> +	 * OEM/Product IDs.
> +	 */
> +	if (!strncmp(oem, "IBM ENSW", 8) &&
> +	    (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
> +		clustered_hint |= CLUSTERED_APIC_XAPIC;
> +	else if (!strncmp(oem, "IBM NUMA", 8))
> +		clustered_hint |= CLUSTERED_APIC_NUMAQ;

[I'm surprised you are not using ACPI for this on your boxes]


> +	 * A test in smp_cluster_apic_check() resolves the above conflict.
> +	 ******/
> +	if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
> +	    mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
> +		clustered_hint |= CLUSTERED_APIC_XAPIC;

Same as above.

> +#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
> +#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)

Ugly. I would use some global for this that is changed by the clustered
apic init code.

Also you could get rid of all the // and #if 1/#if 0


-Andi

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-23  7:11       ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing Andi Kleen
@ 2002-08-23  8:48         ` William Lee Irwin III
  2002-08-23 14:12         ` Martin J. Bligh
  2002-08-23 21:36         ` James Cleverdon
  2 siblings, 0 replies; 8+ messages in thread
From: William Lee Irwin III @ 2002-08-23  8:48 UTC (permalink / raw)
  To: Andi Kleen; +Cc: James Cleverdon, linux-kernel

James Cleverdon <jamesclv@us.ibm.com> writes:
+#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) | ((phys_apic) & APIC_DEST_CLUSTER_MASK))

On Fri, Aug 23, 2002 at 09:11:54AM +0200, Andi Kleen wrote:
> which is not equivalent for more than four CPUs and not using 
> clustered mode. Are you sure this is correct? One of these must be wrong 
> then, either the old or the new code.

IIRC there are some oddities. Figures 7-2 and 7-5 in the P-IV vol3
describe 3 different layouts for MP table APIC ID specifications:

(1) APIC ID format for Xeon processors without HyperThreading
	[1:2]:	processor ID
	[3:4]:	cluster ID

(2) APIC ID format for P6 family processors
	[0:1]:	processor ID
	[2:3]:	cluster ID

(3) APIC ID format for Hyperthreaded processors
	[0:0]:	logical processor ID
	[1:2]:	package ID
	[3:4]:	cluster ID

.. where any bits not specified are reserved. These are as they appear
in the MP table. As destinations in the clustered hierarchical model,
the cluster ID always resides in the upper nybble, and the remainder of
the ID in the lower nybble as a bitmask. So the physical/logic
conversion above is valid for xAPIC's, where the physical:logical
correspondence of destination APIC ID's is such. For NUMA-Q the
physical APIC ID space was not large enough to hold all cpus at once
and so cpus do not have unique physical APIC ID's at all, nor do
IO-APIC's. The physical APIC ID spaces of different nodes are entirely
disjoint, and so the only flaw I see here is that the apic_broadcast_id
is not a suitable criterion for IO-APIC physical ID renumbering on
NUMA-Q (and AFAIK it's entirely unnecessary there also). This bug is
shared with mainline, which panics given a sufficient number of IO-APICs.

The macro above is only used in the case clustered_apic_xapic, and so
doesn't need checking for case (2). Only 4 cpus/cluster are allowable,
so the assumption is that a physical APIC ID is tagged with the cluster
using the same bits as logical APIC ID's. For clustered_apic_xapic this
is the case, for NUMA-Q it is not, and that shifts the cluster ID left
2 bits appropriately in macros conditional on clustered_apic_numaq.

Or so my analysis of it goes.


James Cleverdon <jamesclv@us.ibm.com> writes:
+	 * OEM/Product IDs.
+	 */
+	if (!strncmp(oem, "IBM ENSW", 8) &&
+	    (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
+		clustered_hint |= CLUSTERED_APIC_XAPIC;
+	else if (!strncmp(oem, "IBM NUMA", 8))
+		clustered_hint |= CLUSTERED_APIC_NUMAQ;

On Fri, Aug 23, 2002 at 09:11:54AM +0200, Andi Kleen wrote:
> [I'm surprised you are not using ACPI for this on your boxes]

IBM NUMA == NUMA-Q. AFAIK they were released well prior to any remotely
usable ACPI specifications. The QCT table, which encoded information
similar to various proposed NUMA-ish ACPI tables, was kept as an MP OEM
table by the NUMA-Q BIOS.


Cheers,
Bill

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-23  7:11       ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing Andi Kleen
  2002-08-23  8:48         ` William Lee Irwin III
@ 2002-08-23 14:12         ` Martin J. Bligh
  2002-08-23 21:36         ` James Cleverdon
  2 siblings, 0 replies; 8+ messages in thread
From: Martin J. Bligh @ 2002-08-23 14:12 UTC (permalink / raw)
  To: Andi Kleen, James Cleverdon; +Cc: linux-kernel

> Doing the TPR change is certainly very involved - testing that on 
> a lot of different SMP machines will be definitely needed. I think
> it is the right way to go I agree, balance_irq always looked fishy to
> me, especially with HyperThreading. 

The one advantage it would seem to have is cache warmth for the
interrupt processor - some stickiness is good. But I think using
idle CPUs properly is more important. I don't think an explicit
IO apic programming method can do this fast enough without being
horribly inefficient in terms of constantly reprogramming things.

> How even is the distribution of the interrupts under load? 

Do you really care? I fail to understand why this is a goal for
people. Pretty numbers in /proc/interrupts are meaningless ...
what we really want is to direct interrupts to CPUs where they
can be efficiently processed. That means idle cpus, or cpus with
cache context (warmth) in some form, whether that be for the int
processing code, or the task the interrupt's data is really 
destined for (very hard to determine). 

If they all end up on one CPU because that just happens to be 
efficient, so be it. There was some concern at one point about
timer irq's not being distributed which I don't understand the
problem with, but let's deal with that separately if necessary.

> [I'm surprised you are not using ACPI for this on your boxes]

We don't have ACPI on all our boxes ... some of us are happy
about that ;-)

M.


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-23  7:11       ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing Andi Kleen
  2002-08-23  8:48         ` William Lee Irwin III
  2002-08-23 14:12         ` Martin J. Bligh
@ 2002-08-23 21:36         ` James Cleverdon
  2 siblings, 0 replies; 8+ messages in thread
From: James Cleverdon @ 2002-08-23 21:36 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel

On Friday 23 August 2002 12:11 am, Andi Kleen wrote:
> James Cleverdon <jamesclv@us.ibm.com> writes:
>
>
> Some review.

Thanks for the review.  Comments below.

> > diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
> > --- 2.5.31/arch/i386/kernel/acpi.c	Sat Aug 10 18:41:53 2002
> > +++ s31/arch/i386/kernel/acpi.c	Wed Aug 14 19:30:13 2002
> > @@ -114,6 +114,7 @@
> >  	unsigned long		size)
> >  {
> >  	struct acpi_table_madt	*madt = NULL;
> > +	extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);
>
> This should be moved to acpi.h

Will be, once I'm sure this is the right way to go.  As mentioned earlier, I'm 
having ACPI problems that seem to imply ACPI isn't building the full IRQ 
table.  In 2.4 we could let MPS do this.  Maybe 2.5 will need to revert to 
that behavior.

> >  {
> >  	int			result = 0;
> > +	extern void		smp_cluster_apic_check(void);
>
> And smp.h

Likewise.

> > -		set_ioapic_affinity(irq, 1 << entry->cpu);
> > +		set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));
>
> and cpu_present_to_apicid()
>
> > +#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) &
> > 0x3)) | ((phys_apic) & APIC_DEST_CLUSTER_MASK))
>
> which is not equivalent for more than four CPUs and not using
> clustered mode. Are you sure this is correct? One of these must be wrong
> then, either the old or the new code.

There are several APIC numbering schemes here:

1) Classic Flat mode.  Almost anything goes, and we've seen some rather wacky 
assignments by oddball BIOSes.  We assign logical APIC IDs in CPU on-line 
order.

2) NUMA-Q.  We can take some shortcuts because we know _exactly_ how the BIOS 
is going to assign physical and logical APIC IDs.  In fact, the BIOS has 
already set it all up, so we just need to let the kernel know.

3) Parallel xAPIC.  (Serial xAPIC can be treated as Flat for <= 8 CPUs).  
Intel has defined a particular physical APIC numbering scheme to include 
hyperthreading, so we can easily generate a unique logical ID from it.  This 
is the value produced by physical_to_logical_apicid().  Maybe I should pick a 
more descriptive name, like xapic_physical_to_logical_apicid.  ;^)

> > + * with all interrupts for each quad.  Distribute the interrupts using a
> > + * simple round robin scheme.
> > + */
> > +static int round_robin_cpu_apic_id(void)
> > +{
> > +	int val;
> > +	static unsigned	next_cpu = 0;
>
> This is not protected by any global lock. Are you sure this is ok ?

Yes, it's done by the boot CPU only.  Not that it matters; lacking any 
standard I/O bus to CPU locality table, I can only assign IRQs to APIC 
clusters randomly.

> > @@ -1288,7 +1321,7 @@
> >   */
> >  static void ack_edge_ioapic_irq(unsigned int irq)
> >  {
> > -	balance_irq(irq);
> > +//	balance_irq(irq);
>
> I would get rid of it completely.

Zounds!  You have uncovered my diabolical plot -- to make balance_irq 
unnecessary!  Curses, foiled again!    8^)

> Doing the TPR change is certainly very involved - testing that on
> a lot of different SMP machines will be definitely needed. I think
> it is the right way to go I agree, balance_irq always looked fishy to
> me, especially with HyperThreading. How even is the distribution of the
> interrupts under load? Did you test it with Intel chipset P4s ?
> Is this mode implemented on all APICs ?
> Do you have any thoughts on this scheme on how this interacts with
> HyperThreading ?

Yes, I've given quite a bit of thought to how this code and hyperthreading get 
along.  Since all schedulers since 2.4.14 dispatch tasks to sibling 
processors last, they will take the bulk of the interrupt traffic on a medium 
loaded system.  This shouldn't be a problem, since the siblings have their 
own local APICs.

In my testing, the distribution of interrupts under load begins to approach 
even.  (Not that it matters much so long as we are targeting idle CPUs.)  
But, I've been running the chat benchmark between two x440s almost 
exclusively, to maximize CPU and interrupt load.  Doubtless other job mixes 
will produce different results.  Andrew Theurer has done a bit of basic 
sanity testing too.  If I can get summit and 2.5 working with hyperthreading, 
it is slated for lots more testing.

Yes, we'll have to exercise this code with lots more oddball SMP systems.  My 
site is rather limited on test hardware.  It is almost entirely NUMA-Q, 
Summit, or stock Netfinity.

> > @@ -332,6 +332,7 @@
> >
> >  	irq_enter();
> >  	kstat.irqs[cpu][irq]++;
> > +	apic_adj_tpr(TPR_IRQ);
> >  	spin_lock(&desc->lock);
> >  	desc->handler->ack(irq);
> >  	/*
> > @@ -389,6 +390,7 @@
> >  	 */
> >  	desc->handler->end(irq);
> >  	spin_unlock(&desc->lock);
> > +	apic_adj_tpr(-TPR_IRQ);
>
> It may make sense to it raised over softirqs as well.
> This is a bit tricky because they are called from the entry.S
> assembly. It may make sense to raise it again using some asm/ defined
> macros in kernel/softirq.c. If not a CPU mostly processing softirqs
> will be marked idle in the idle loop, which is not good.

Good idea.  I'll take a look at that.

> > translation_table[mpc_record]->trans_local;
> > @@ -253,6 +259,15 @@
> >  	}
> >  	mp_ioapics[nr_ioapics] = *m;
> >  	nr_ioapics++;
> > +	/******
> > +	 * Warning!  We have an APIC version number collision between the APICs
> > +	 * on Scorpio-based NUMA-Q boxes and Summit xAPICs.  Intel didn't
> > +	 * define the xAPIC ver ID range until late in the development cycle,
> > +	 * so there is working silicon out there that doesn't match it.
> > +	 * A test in smp_cluster_apic_check() resolves the above conflict.
> > +	 ******/
> > +	if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
> > +		clustered_hint |= CLUSTERED_APIC_XAPIC;
> >  }
>
> This looks risky in the general case. Can't you wrap it with some special
> check to make sure it only ever triggers on your hardware?

That's already done by the code fragment below.  If folks think the xAPIC 
version range test above is too dangerous, it can easily be removed.

> > +	 * OEM/Product IDs.
> > +	 */
> > +	if (!strncmp(oem, "IBM ENSW", 8) &&
> > +	    (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
> > +		clustered_hint |= CLUSTERED_APIC_XAPIC;
> > +	else if (!strncmp(oem, "IBM NUMA", 8))
> > +		clustered_hint |= CLUSTERED_APIC_NUMAQ;
>
> [I'm surprised you are not using ACPI for this on your boxes]

ACPI was not, is not, and will never be available for NUMA-Q.  NUMA-Q was 
released long before ACPI was hatched.  There is a permanent feature freeze 
on NUMA-Q firmware.  Only bugs are fixed.  (Maybe.  The firmware folks got 
axed by the latest layoff....)

Thus, we have to make this work with MPS.  That's a good idea in any case.

> > +	 * A test in smp_cluster_apic_check() resolves the above conflict.
> > +	 ******/
> > +	if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
> > +	    mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
> > +		clustered_hint |= CLUSTERED_APIC_XAPIC;
>
> Same as above.

Ditto.

> > +#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
> > +#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)
>
> Ugly. I would use some global for this that is changed by the clustered
> apic init code.

That's straight from Martin's code, already in the base.  Only the names were 
changed to protect the guilty.   8^)

> Also you could get rid of all the // and #if 1/#if 0

Yup.  Was left in to show my uncertainty about the IRQ weirdness.  Oh, and to 
flag balance_irq.

> -Andi

Thanks again for the review.  I appreciate it.

-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
@ 2002-08-24  0:29 Grover, Andrew
  2002-08-26  1:59 ` James Cleverdon
  0 siblings, 1 reply; 8+ messages in thread
From: Grover, Andrew @ 2002-08-24  0:29 UTC (permalink / raw)
  To: 'jamesclv@us.ibm.com', Andi Kleen; +Cc: linux-kernel

> From: James Cleverdon [mailto:jamesclv@us.ibm.com] 
> > This should be moved to acpi.h
> 
> Will be, once I'm sure this is the right way to go.  As 
> mentioned earlier, I'm 
> having ACPI problems that seem to imply ACPI isn't building 
> the full IRQ 
> table.  In 2.4 we could let MPS do this.  Maybe 2.5 will need 
> to revert to 
> that behavior.

What happens when you use the FULL ACPI support? I suspect that you really
do want the interpreter, in order to evaluate _PRTs properly.

ISTR that the reason you are thinking that ACPI only is programming some of
the ioapic entries is because whatever is printing them is looking at the
mp_irqs array. Which is MPS specific. So ACPI doesn't bother filling it all
in. :)

Is that a bug? Should ACPI fill it in completely, or maybe not at all? Don't
know. But it is strictly unnecessary.

Regards -- Andy

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
  2002-08-24  0:29 Grover, Andrew
@ 2002-08-26  1:59 ` James Cleverdon
  0 siblings, 0 replies; 8+ messages in thread
From: James Cleverdon @ 2002-08-26  1:59 UTC (permalink / raw)
  To: Grover, Andrew, Andi Kleen; +Cc: linux-kernel

On Friday 23 August 2002 05:29 pm, Grover, Andrew wrote:
> > From: James Cleverdon [mailto:jamesclv@us.ibm.com]
> >
> > > This should be moved to acpi.h
> >
> > Will be, once I'm sure this is the right way to go.  As
> > mentioned earlier, I'm
> > having ACPI problems that seem to imply ACPI isn't building
> > the full IRQ
> > table.  In 2.4 we could let MPS do this.  Maybe 2.5 will need
> > to revert to
> > that behavior.
>
> What happens when you use the FULL ACPI support? I suspect that you really
> do want the interpreter, in order to evaluate _PRTs properly.
>
> ISTR that the reason you are thinking that ACPI only is programming some of
> the ioapic entries is because whatever is printing them is looking at the
> mp_irqs array. Which is MPS specific. So ACPI doesn't bother filling it all
> in. :)
>
> Is that a bug? Should ACPI fill it in completely, or maybe not at all?
> Don't know. But it is strictly unnecessary.
>
> Regards -- Andy

Bingo!  With full ACPI turned on, the system does indeed boot.  The extra I/O 
APIC entries are being programmed from the PRT.

(Call chain is:  pci_acpi_init --> acpi_pci_irq_init --> mp_parse_prt --> 
io_apic_set_pci_routing)

So, given that quite a number of our customers would like to run with 
hyperthreading turned on, but do not want full ACPI, what is the right thing 
to do in the HT-only case?  Add extra code to process the PRT?  Fall back on 
MPS's IRQ records?  Something else entirely?

-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing
@ 2002-08-26  7:05 Grover, Andrew
  0 siblings, 0 replies; 8+ messages in thread
From: Grover, Andrew @ 2002-08-26  7:05 UTC (permalink / raw)
  To: 'jamesclv@us.ibm.com', Andi Kleen; +Cc: linux-kernel

> From: James Cleverdon [mailto:jamesclv@us.ibm.com] 
> > What happens when you use the FULL ACPI support? I suspect 
> that you really
> > do want the interpreter, in order to evaluate _PRTs properly.

> Bingo!  With full ACPI turned on, the system does indeed 
> boot.  The extra I/O 
> APIC entries are being programmed from the PRT.
> 
> (Call chain is:  pci_acpi_init --> acpi_pci_irq_init --> 
> mp_parse_prt --> 
> io_apic_set_pci_routing)
> 
> So, given that quite a number of our customers would like to run with 
> hyperthreading turned on, but do not want full ACPI, what is 
> the right thing 
> to do in the HT-only case?  Add extra code to process the 
> PRT?  Fall back on 
> MPS's IRQ records?  Something else entirely?

The solution is ACPI. Full ACPI. What is the problem? I have devoted too
much time already to make hybrid ACPI/MPS combos work, but that will never
be the right solution.

Please have your customers email me privately and tell me why ~100KB of mem
on a 1GB+ system is something us engineers should spend our valuable time
hacking around, when the correct solution already is implemented and
*works*.

Regards -- Andy

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2002-08-26  7:01 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <Pine.LNX.4.33.0208131421190.3110-100000@penguin.transmeta.com.suse.lists.linux.kernel>
     [not found] ` <200208131729.50127.habanero@us.ibm.com.suse.lists.linux.kernel>
     [not found]   ` <20020813233007.GV14394@dualathlon.random.suse.lists.linux.kernel>
     [not found]     ` <200208221931.35052.jamesclv@us.ibm.com.suse.lists.linux.kernel>
2002-08-23  7:11       ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing Andi Kleen
2002-08-23  8:48         ` William Lee Irwin III
2002-08-23 14:12         ` Martin J. Bligh
2002-08-23 21:36         ` James Cleverdon
2002-08-26  7:05 Grover, Andrew
  -- strict thread matches above, loose matches on Subject: below --
2002-08-24  0:29 Grover, Andrew
2002-08-26  1:59 ` James Cleverdon
2002-08-13 21:24 [PATCH] NUMA-Q disable irqbalance Linus Torvalds
2002-08-13 22:29 ` Andrew Theurer
2002-08-13 23:30   ` Andrea Arcangeli
2002-08-23  2:31     ` [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing James Cleverdon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox