* Summit patch for 2.4.19-rc3-ac2
@ 2002-07-23 4:21 James Cleverdon
2002-07-23 8:51 ` Lech Szychowski
` (4 more replies)
0 siblings, 5 replies; 17+ messages in thread
From: James Cleverdon @ 2002-07-23 4:21 UTC (permalink / raw)
To: linux-kernel; +Cc: Steven Cole
[-- Attachment #1: Type: text/plain, Size: 494 bytes --]
Here's a patch for those who have been plagued by APIC errors starting around
-rc1-ac6. I've submitted it to Alan, but since it has been affecting a
number of folks, I'm also posting it here for your consideration and review.
This fixes the APIC receive accept errors on the two machines we have that
were subject to it. Let me know if it doesn't work for you.
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
[-- Attachment #2: 2.4.19-rc3-ac2_summit.2002-07-22 --]
[-- Type: text/x-diff, Size: 11594 bytes --]
diff -ruN 2.4.19-rc3-ac2/arch/i386/kernel/apic.c ac2/arch/i386/kernel/apic.c
--- 2.4.19-rc3-ac2/arch/i386/kernel/apic.c Mon Jul 22 20:03:41 2002
+++ ac2/arch/i386/kernel/apic.c Mon Jul 22 20:04:38 2002
@@ -261,20 +261,6 @@
apic_write_around(APIC_LVT1, value);
}
-/*
- * To build the logical APIC ID for each CPU we have three cases:
- * 1) Normal flat mode: use a bitmap of the CPU numbers
- * 2) Logical multi-quad (NUMA-Q): do nothing, the BIOS has set it up
- * 3) Physical multi-quad (xAPIC clusters): convert the Intel standard
- * physical APIC ID to a cluster nibble/cpu bitmap nibble
- *
- *** mps_cpu (index number): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...
- *** CPUs have xAPIC phys IDs: 00, 01, 02, 03, 10, 11, 12, 13, 20, 21, ...
- *** its logical ID: 01, 02, 04, 08, 11, 12, 14, 18, 21, 22, ...
- */
-
-#define physical_to_logical_apicid(phys_apic) ( (1UL << (phys_apic & 0x3)) | (phys_apic & 0xF0U) )
-
static unsigned long apic_ldr_value(unsigned long value)
{
if (clustered_apic_logical)
diff -ruN 2.4.19-rc3-ac2/arch/i386/kernel/io_apic.c ac2/arch/i386/kernel/io_apic.c
--- 2.4.19-rc3-ac2/arch/i386/kernel/io_apic.c Mon Jul 22 20:03:41 2002
+++ ac2/arch/i386/kernel/io_apic.c Mon Jul 22 20:06:07 2002
@@ -760,7 +760,7 @@
* skip adding the timer int on secondary nodes, which causes
* a small but painful rift in the time-space continuum
*/
- if (clustered_apic_mode && (apic != 0) && (irq == 0))
+ if (clustered_apic_logical && (apic != 0) && (irq == 0))
continue;
else
add_pin_to_irq(irq, apic, pin);
diff -ruN 2.4.19-rc3-ac2/arch/i386/kernel/mpparse.c ac2/arch/i386/kernel/mpparse.c
--- 2.4.19-rc3-ac2/arch/i386/kernel/mpparse.c Mon Jul 22 20:03:41 2002
+++ ac2/arch/i386/kernel/mpparse.c Mon Jul 22 20:04:38 2002
@@ -162,7 +162,7 @@
if (!(m->mpc_cpuflag & CPU_ENABLED))
return;
- logical_apicid = m->mpc_apicid;
+ logical_apicid = 0x01;
if (clustered_apic_logical) {
quad = translation_table[mpc_record]->trans_quad;
logical_apicid = (quad << 4) +
diff -ruN 2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c ac2/arch/i386/kernel/smpboot.c
--- 2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c Mon Jul 22 20:03:41 2002
+++ ac2/arch/i386/kernel/smpboot.c Mon Jul 22 20:08:02 2002
@@ -511,59 +511,28 @@
return do_fork(CLONE_VM|CLONE_PID, 0, &regs, 0);
}
-/* which physical APIC ID maps to which logical CPU number */
-volatile unsigned char physical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which physical APIC ID */
-volatile unsigned char cpu_2_physical_apicid[NR_CPUS];
+volatile u8 cpu_2_physical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-/* which logical APIC ID maps to which logical CPU number */
-volatile unsigned char logical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which logical APIC ID */
-volatile unsigned char cpu_2_logical_apicid[NR_CPUS];
+volatile u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-static inline void init_cpu_to_apicid(void)
-/* Initialize all maps between cpu number and apicids */
-{
- int apicid, cpu;
-
- for (apicid = 0; apicid < MAX_APICID; apicid++) {
- physical_apicid_2_cpu[apicid] = BAD_APICID;
- logical_apicid_2_cpu[apicid] = BAD_APICID;
- }
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpu_2_physical_apicid[cpu] = BAD_APICID;
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- }
-}
-
-static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
+static inline void map_cpu_to_boot_apicid(int cpu, int phys_apicid, int log_apicid)
/*
- * set up a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
+ * set up a mapping between cpu and apicids.
*/
{
- if (clustered_apic_logical) {
- logical_apicid_2_cpu[apicid] = (unsigned char) cpu;
- cpu_2_logical_apicid[cpu] = (unsigned char) apicid;
- } else {
- physical_apicid_2_cpu[apicid] = (unsigned char) cpu;
- cpu_2_physical_apicid[cpu] = (unsigned char) apicid;
- }
+ cpu_2_logical_apicid[cpu] = (u8) log_apicid;
+ cpu_2_physical_apicid[cpu] = (u8) phys_apicid;
}
-static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
+static inline void unmap_cpu_to_boot_apicid(int cpu)
/*
- * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
+ * undo a mapping between cpu and apicids.
*/
{
- if (clustered_apic_logical) {
- logical_apicid_2_cpu[apicid] = BAD_APICID;
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- } else {
- physical_apicid_2_cpu[apicid] = BAD_APICID;
- cpu_2_physical_apicid[cpu] = BAD_APICID;
- }
+ cpu_2_logical_apicid[cpu] = BAD_APICID;
+ cpu_2_physical_apicid[cpu] = BAD_APICID;
}
#if APIC_DEBUG
@@ -777,17 +746,13 @@
extern unsigned long cpu_initialized;
-static void __init do_boot_cpu (int apicid)
-/*
- * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
- * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
- */
+static void __init do_boot_cpu (int phys_apicid, int log_apicid)
{
struct task_struct *idle;
unsigned long boot_error = 0;
int timeout, cpu;
unsigned long start_eip;
- unsigned short nmi_high, nmi_low;
+ unsigned short nmi_high = 0, nmi_low = 0;
cpu = ++cpucount;
/*
@@ -807,7 +772,7 @@
init_idle(idle, cpu);
- map_cpu_to_boot_apicid(cpu, apicid);
+ map_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
idle->thread.eip = (unsigned long) start_secondary;
@@ -817,7 +782,7 @@
start_eip = setup_trampoline();
/* So we see what's up */
- printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+ printk("Booting processor %d/%d eip %lx\n", cpu, log_apicid, start_eip);
stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle);
/*
@@ -846,7 +811,7 @@
/*
* Be paranoid about clearing APIC errors.
*/
- if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[apicid])) {
+ if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[phys_apicid])) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -862,9 +827,9 @@
*/
if (clustered_apic_logical)
- boot_error = wakeup_secondary_via_NMI(apicid);
+ boot_error = wakeup_secondary_via_NMI(log_apicid);
else
- boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
+ boot_error = wakeup_secondary_via_INIT(phys_apicid, start_eip);
if (!boot_error) {
/*
@@ -900,13 +865,13 @@
printk("Not responding.\n");
#if APIC_DEBUG
if (!clustered_apic_mode)
- inquire_remote_apic(apicid);
+ inquire_remote_apic(phys_apicid);
#endif
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
- unmap_cpu_to_boot_apicid(cpu, apicid);
+ unmap_cpu_to_boot_apicid(cpu);
clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
clear_bit(cpu, &cpu_online_map); /* was set in smp_callin() */
@@ -975,7 +940,6 @@
extern int prof_old_multiplier[NR_CPUS];
extern int prof_counter[NR_CPUS];
-static int boot_cpu_logical_apicid;
#ifdef CONFIG_MULTIQUAD
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
@@ -985,7 +949,7 @@
void __init smp_boot_cpus(void)
{
- int apicid, cpu, bit;
+ int phys_apicid, log_apicid, cpu, bit;
#ifdef CONFIG_MULTIQUAD
if (clustered_apic_logical && (numnodes > 1)) {
@@ -1014,8 +978,6 @@
prof_multiplier[cpu] = 1;
}
- init_cpu_to_apicid();
-
/*
* Setup boot CPU information
*/
@@ -1027,8 +989,13 @@
* We have the boot CPU online for sure.
*/
set_bit(0, &cpu_online_map);
- boot_cpu_logical_apicid = logical_smp_processor_id();
- map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+ if (clustered_apic_physical)
+ boot_cpu_logical_apicid = physical_to_logical_apicid(boot_cpu_physical_apicid);
+ else if (clustered_apic_logical)
+ boot_cpu_logical_apicid = logical_smp_processor_id();
+ else
+ boot_cpu_logical_apicid = 0x01;
+ map_cpu_to_boot_apicid(0, boot_cpu_physical_apicid, boot_cpu_logical_apicid);
global_irq_holder = 0;
current->cpu = 0;
@@ -1111,27 +1078,32 @@
Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
for (bit = 0; bit < NR_CPUS; bit++) {
- apicid = cpu_present_to_apicid(bit);
- /*
- * Don't even attempt to start the boot CPU!
- */
- if (apicid == boot_cpu_apicid)
- continue;
-
if (!(phys_cpu_present_map & (1UL << bit)))
continue;
if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
continue;
+ phys_apicid = raw_phys_apicid[bit];
+ /*
+ * Don't even attempt to start the boot CPU!
+ */
+ if (phys_apicid == boot_cpu_physical_apicid)
+ continue;
+ if (clustered_apic_physical)
+ log_apicid = physical_to_logical_apicid(phys_apicid);
+ else if (clustered_apic_logical)
+ log_apicid = ((bit >> 2) << 4) | (1 << (bit & 0x3));
+ else
+ log_apicid = 1u << bit;
- do_boot_cpu(apicid);
+ do_boot_cpu(phys_apicid, log_apicid);
/*
* Make sure we unmap all failed CPUs
*/
- if ((boot_apicid_to_cpu(apicid) == -1) &&
- (phys_cpu_present_map & (1 << bit)))
+ if ((cpu_to_physical_apicid(bit) == BAD_APICID) &&
+ (phys_cpu_present_map & (1ul << bit)))
printk("CPU #%d not responding - cannot use it.\n",
- apicid);
+ bit);
}
/*
diff -ruN 2.4.19-rc3-ac2/include/asm-i386/mpspec.h ac2/include/asm-i386/mpspec.h
--- 2.4.19-rc3-ac2/include/asm-i386/mpspec.h Mon Jul 22 20:03:42 2002
+++ ac2/include/asm-i386/mpspec.h Mon Jul 22 20:04:38 2002
@@ -201,6 +201,7 @@
extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int boot_cpu_logical_apicid;
extern unsigned long phys_cpu_present_map;
extern int smp_found_config;
extern void find_smp_config (void);
diff -ruN 2.4.19-rc3-ac2/include/asm-i386/smpboot.h ac2/include/asm-i386/smpboot.h
--- 2.4.19-rc3-ac2/include/asm-i386/smpboot.h Mon Jul 22 20:03:42 2002
+++ ac2/include/asm-i386/smpboot.h Mon Jul 22 20:04:38 2002
@@ -12,13 +12,27 @@
extern unsigned char raw_phys_apicid[NR_CPUS];
+/*
+ * To build the logical APIC ID for each CPU we have three cases:
+ * 1) Normal flat mode: use a bitmap of the CPU numbers
+ * 2) Logical multi-quad (NUMA-Q): do nothing, the BIOS has set it up
+ * 3) Physical multi-quad (xAPIC clusters): convert the Intel standard
+ * physical APIC ID to a cluster nibble/cpu bitmap nibble
+ *
+ *** mps_cpu (index number): 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...
+ *** CPUs have xAPIC phys IDs: 00, 01, 02, 03, 10, 11, 12, 13, 20, 21, ...
+ *** its logical ID: 01, 02, 04, 08, 11, 12, 14, 18, 21, 22, ...
+ */
+
+#define physical_to_logical_apicid(phys_apic) ( (1ul << (phys_apic & 0x3)) | (phys_apic & 0xF0u) )
+
static inline int cpu_present_to_apicid(int mps_cpu)
{
if(clustered_apic_logical)
return (mps_cpu/4)*16 + (1<<(mps_cpu%4));
if(clustered_apic_physical)
return raw_phys_apicid[mps_cpu];
- return mps_cpu;
+ return 1 << mps_cpu;
}
static inline unsigned long apicid_to_phys_cpu_present(int apicid)
@@ -33,10 +47,8 @@
* The first four macros are trivial, but it keeps the abstraction consistent
*/
-extern volatile unsigned char logical_apicid_2_cpu[];
-extern volatile unsigned char cpu_2_logical_apicid[];
-extern volatile unsigned char physical_apicid_2_cpu[];
-extern volatile unsigned char cpu_2_physical_apicid[];
+extern volatile u8 cpu_2_logical_apicid[];
+extern volatile u8 cpu_2_physical_apicid[];
#define logical_apicid_to_cpu(apicid) (int)logical_apicid_2_cpu[apicid]
#define cpu_to_logical_apicid(cpu) (int)cpu_2_logical_apicid[cpu]
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Summit patch for 2.4.19-rc3-ac2
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
@ 2002-07-23 8:51 ` Lech Szychowski
2002-07-23 12:03 ` 2.4.19-rc3-ac2 SMP Zwane Mwaikambo
` (3 subsequent siblings)
4 siblings, 0 replies; 17+ messages in thread
From: Lech Szychowski @ 2002-07-23 8:51 UTC (permalink / raw)
To: James Cleverdon; +Cc: linux-kernel
> This fixes the APIC receive accept errors on the two machines we have that
> were subject to it. Let me know if it doesn't work for you.
This patch works for my Asus P2B-DS with 2xPII(Deschutes)-400.
--
Leszek.
-- lech7@pse.pl 2:480/33.7 -- REAL programmers use INTEGERS --
-- speaking just for myself...
^ permalink raw reply [flat|nested] 17+ messages in thread
* 2.4.19-rc3-ac2 SMP
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
2002-07-23 8:51 ` Lech Szychowski
@ 2002-07-23 12:03 ` Zwane Mwaikambo
2002-07-23 12:11 ` Zwane Mwaikambo
2002-07-23 13:30 ` Summit patch for 2.4.19-rc3-ac2 James Bourne
` (2 subsequent siblings)
4 siblings, 1 reply; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-23 12:03 UTC (permalink / raw)
To: Alan Cox, James Cleverdon; +Cc: Linux Kernel
Hi Alan, James
This is what i have so far, i'll probably have time to really
debug it when i get home later today. The problem persists with James's
Summit patch applied too (just in case there were other fixes there).
Intel MultiProcessor Specification v1.4
Virtual Wire compatibility mode.
OEM ID: OEM00000 Product ID: 0.1 APIC at: 0xFEE00000
Processor #0 Pentium(tm) Pro APIC version 17
Processor #1 Pentium(tm) Pro APIC version 17
Processor #2 Pentium(tm) Pro APIC version 17
Processor #3 Pentium(tm) Pro APIC version 17
I/O APIC #4 Version 17 at 0xFEC00000.
Enabling APIC mode: Flat. Using 1 I/O APICs
Processors: 4
[...]
ENABLING IO-APIC IRQs
Setting 4 in the phys_id_present_map
...changing IO-APIC physical APIC ID to 4 ... ok.
..TIMER: vector=0x31 pin1=0 pin2=-1
<dead>
Around here the machine gets a vector 0x31 (timer) interrupt on CPU0 then
locks up since the destination cpu bitmask is 0, It also seems that the
code is trying to use logical apic id in places instead of the physical
apic id, i saw attempted deliveries to physical apic id 4 and 8, this can
possibly explain the APIC receive errors people were reporting?
Unfortunately this is the only info i have right now, i thought i'd be
able to gather more but looks like i'll have to wait till i get home.
As a control, the machine boots and runs 2.4.19-rc2
Regards,
Zwane Mwaikambo
Alan on a side note, would you like a patch to bring around some fixes
from 2.5 irq_balance? Your kernel can't boot UP w/ IOAPIC
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-23 12:03 ` 2.4.19-rc3-ac2 SMP Zwane Mwaikambo
@ 2002-07-23 12:11 ` Zwane Mwaikambo
2002-07-23 18:50 ` James Cleverdon
0 siblings, 1 reply; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-23 12:11 UTC (permalink / raw)
To: Alan Cox, James Cleverdon; +Cc: Linux Kernel
On Tue, 23 Jul 2002, Zwane Mwaikambo wrote:
> Around here the machine gets a vector 0x31 (timer) interrupt on CPU0 then
> locks up since the destination cpu bitmask is 0, It also seems that the
> code is trying to use logical apic id in places instead of the physical
> apic id, i saw attempted deliveries to physical apic id 4 and 8, this can
> possibly explain the APIC receive errors people were reporting?
Correction, the logical/physical apic id problem doesn't appear to be
there with the summit patch. What i'm currently seeing is a destination of
0 with a non flat/physical destination format.
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Summit patch for 2.4.19-rc3-ac2
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
2002-07-23 8:51 ` Lech Szychowski
2002-07-23 12:03 ` 2.4.19-rc3-ac2 SMP Zwane Mwaikambo
@ 2002-07-23 13:30 ` James Bourne
2002-07-23 13:42 ` Steven Cole
2002-07-23 14:34 ` Philippe Gramoullé
4 siblings, 0 replies; 17+ messages in thread
From: James Bourne @ 2002-07-23 13:30 UTC (permalink / raw)
To: linux-kernel; +Cc: James Cleverdon, Alan Cox
On Mon, 22 Jul 2002, James Cleverdon wrote:
> Here's a patch for those who have been plagued by APIC errors starting around
> -rc1-ac6. I've submitted it to Alan, but since it has
> been affecting a number of folks, I'm also posting it here for
> your consideration and review.
>
> This fixes the APIC receive accept errors on the two machines we have
> that were subject to it. Let me know if it doesn't work for you.
It does work with the Dell 2400.
Same config I forwarded to the list, minus the RZ1000, PIIX, CMD640, and
pcmcia bits.
Regards
James Bourne
--
James Bourne, Supervisor Data Centre Operations
Mount Royal College, Calgary, AB, CA
www.mtroyal.ab.ca
******************************************************************************
This communication is intended for the use of the recipient to which it is
addressed, and may contain confidential, personal, and or privileged
information. Please contact the sender immediately if you are not the
intended recipient of this communication, and do not copy, distribute, or
take action relying on it. Any communication received in error, or
subsequent reply, should be deleted or destroyed.
******************************************************************************
"There are only 10 types of people in this world: those who
understand binary and those who don't."
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Summit patch for 2.4.19-rc3-ac2
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
` (2 preceding siblings ...)
2002-07-23 13:30 ` Summit patch for 2.4.19-rc3-ac2 James Bourne
@ 2002-07-23 13:42 ` Steven Cole
2002-07-23 14:34 ` Philippe Gramoullé
4 siblings, 0 replies; 17+ messages in thread
From: Steven Cole @ 2002-07-23 13:42 UTC (permalink / raw)
To: jamesclv; +Cc: linux-kernel, Alan Cox
On Mon, 2002-07-22 at 22:21, James Cleverdon wrote:
> Here's a patch for those who have been plagued by APIC errors starting around
> -rc1-ac6. I've submitted it to Alan, but since it has been affecting a
> number of folks, I'm also posting it here for your consideration and review.
>
> This fixes the APIC receive accept errors on the two machines we have that
> were subject to it. Let me know if it doesn't work for you.
Thanks. That worked for my Intel STL2 with 2 x P-III (Coppermine).
Steven
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Summit patch for 2.4.19-rc3-ac2
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
` (3 preceding siblings ...)
2002-07-23 13:42 ` Steven Cole
@ 2002-07-23 14:34 ` Philippe Gramoullé
4 siblings, 0 replies; 17+ messages in thread
From: Philippe Gramoullé @ 2002-07-23 14:34 UTC (permalink / raw)
To: jamesclv; +Cc: linux-kernel
On Mon, 22 Jul 2002 21:21:04 -0700
James Cleverdon <jamesclv@us.ibm.com> wrote:
Thanks, it works now on both DELL MT 530 (2xPIII Xeon 1.5GHz) and DELL 2450 (2xPIII Coppermine 1GHz).
Before, the MT 530 halted on the SCSI probe, while the 2450 was stuck with the APIC error message
on CPU #0.
Thanks,
Philippe.
| This fixes the APIC receive accept errors on the two machines we have that
| were subject to it. Let me know if it doesn't work for you.
|
| --
| James Cleverdon
| IBM xSeries Linux Solutions
| {jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
|
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-23 12:11 ` Zwane Mwaikambo
@ 2002-07-23 18:50 ` James Cleverdon
2002-07-24 15:26 ` Zwane Mwaikambo
0 siblings, 1 reply; 17+ messages in thread
From: James Cleverdon @ 2002-07-23 18:50 UTC (permalink / raw)
To: Zwane Mwaikambo, Alan Cox; +Cc: Linux Kernel
[-- Attachment #1: Type: text/plain, Size: 1197 bytes --]
On Tuesday 23 July 2002 05:11 am, Zwane Mwaikambo wrote:
> On Tue, 23 Jul 2002, Zwane Mwaikambo wrote:
> > Around here the machine gets a vector 0x31 (timer) interrupt on CPU0 then
> > locks up since the destination cpu bitmask is 0, It also seems that the
> > code is trying to use logical apic id in places instead of the physical
> > apic id, i saw attempted deliveries to physical apic id 4 and 8, this can
> > possibly explain the APIC receive errors people were reporting?
>
> Correction, the logical/physical apic id problem doesn't appear to be
> there with the summit patch. What i'm currently seeing is a destination of
> 0 with a non flat/physical destination format.
Drat! I thought I had all the logical vs. physical stuff straightened out.
Could you give this patch a try? It dumps all kinds of APIC state info.
You'll need to put a call to apic_state_dump() into check_timer() just after
the TIMER: printk.
(Hmmm.... Must clean up this patch and submit it to kdb as two new commands,
one for I/O APICs and one for local APICs....)
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
[-- Attachment #2: apic_error.2.4.19-rc1-ac7 --]
[-- Type: text/x-diff, Size: 3371 bytes --]
--- 2.4.19-rc1-ac7/arch/i386/kernel/apic.c Wed Jul 17 12:02:50 2002
+++ ac7/arch/i386/kernel/apic.c Thu Jul 18 19:41:57 2002
@@ -1131,13 +1131,100 @@
smp_processor_id());
}
+static spinlock_t apic_dump_lock = SPIN_LOCK_UNLOCKED;
+
+static void
+apic_bit_vector_dump(unsigned long addr, char *name)
+{
+ int n;
+
+ printk("%s:", name);
+ for (n = 256 / 32; --n >= 0; addr += 0x10) {
+ printk(" %08lX", apic_read(addr));
+ }
+ printk("\n");
+}
+
+void
+print_ioapic_rtes(void)
+{
+ register int apic, rte, rte_max;
+
+ for (apic = 0; apic < nr_ioapics; apic++) {
+ printk("I/O APIC # %d:", apic);
+ rte_max = nr_ioapic_registers[apic];
+ for (rte = 0; rte < rte_max; rte++) {
+ if ((rte & 0x3) == 0)
+ printk("\n%02X:", rte);
+ printk(" %08X:%08X",
+ io_apic_read(apic, 0x10 + rte*2),
+ io_apic_read(apic, 0x10 + 1 + rte*2));
+ }
+ printk("\n");
+ }
+}
+
+/* Set breakpoint here. */
+void
+apic_state_dump_bp(void)
+{
+ cpu_relax();
+}
+
+/*
+ * apic_state_dump -- Print large amounts of APIC and related info.
+ */
+
+void
+apic_state_dump(void)
+{
+ register int v;
+ unsigned long flags;
+
+ spin_lock_irqsave(&apic_dump_lock, flags);
+
+ printk("ID=0x%08lX, LVR=0x%08lX, TPR=0x%08lX, ARB=0x%08lX, PROCPRI=0x%08lX\n", apic_read(APIC_ID), apic_read(APIC_LVR), apic_read(APIC_TASKPRI), apic_read(APIC_ARBPRI), apic_read(APIC_PROCPRI));
+ printk("DFR=0x%08lX, LDR=0x%08lX, ICR=0x%08lX\n", apic_read(APIC_DFR), apic_read(APIC_LDR), apic_read(APIC_ICR));
+ printk("SPIV=0x%08lX, ICR=0x%08lX, ICR2=0x%08lX, LVTT=0x%08lX, LVTPC=0x%08lX\n", apic_read(APIC_SPIV), apic_read(APIC_ICR), apic_read(APIC_ICR2), apic_read(APIC_LVTT), apic_read(APIC_LVTPC));
+ printk("LVT0=0x%08lX, LVT1=0x%08lX, LVTERR=0x%08lX\n", apic_read(APIC_LVT0), apic_read(APIC_LVT1), apic_read(APIC_LVTERR));
+ apic_bit_vector_dump(APIC_ISR, "ISR");
+ apic_bit_vector_dump(APIC_TMR, "TMR");
+ apic_bit_vector_dump(APIC_IRR, "IRR");
+ printk("clustered_apic_mode=%d, esr_disable=%d, target_cpus=0x%02X\n", clustered_apic_mode, esr_disable, (u32)target_cpus);
+ printk("apic_broadcast_id=0x%02X\n", (u32)apic_broadcast_id);
+ printk("raw_phys_apicid[]= ");
+ for (v = 0; v < NR_CPUS; v++) {
+ printk(" %02X", raw_phys_apicid[v]);
+ }
+ printk("\n");
+ printk("cpu_2_logical_apicid[]= ");
+ for (v = 0; v < NR_CPUS; v++) {
+ printk(" %02X", cpu_2_logical_apicid[v]);
+ }
+ printk("\n");
+ printk("cpu_2_physical_apicid[]=");
+ for (v = 0; v < NR_CPUS; v++) {
+ printk(" %02X", cpu_2_physical_apicid[v]);
+ }
+ printk("\n");
+ print_ioapic_rtes();
+
+ spin_unlock_irqrestore(&apic_dump_lock, flags);
+ apic_state_dump_bp();
+}
+
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
+static spinlock_t smp_error_intr_lock = SPIN_LOCK_UNLOCKED;
+
asmlinkage void smp_error_interrupt(void)
{
unsigned long v, v1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&smp_error_intr_lock, flags);
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
@@ -1158,6 +1245,11 @@
*/
printk (KERN_ERR "APIC error on CPU%d: %02lx(%02lx)\n",
smp_processor_id(), v , v1);
+ apic_state_dump();
+ /* APICs tend to spasm when they get errors. Disable the error intr. */
+ apic_write_around(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
+
+ spin_unlock_irqrestore(&smp_error_intr_lock, flags);
}
/*
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-23 18:50 ` James Cleverdon
@ 2002-07-24 15:26 ` Zwane Mwaikambo
2002-07-24 22:50 ` James Cleverdon
2002-07-25 3:34 ` James Cleverdon
0 siblings, 2 replies; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-24 15:26 UTC (permalink / raw)
To: James Cleverdon; +Cc: Alan Cox, Linux Kernel
On Tue, 23 Jul 2002, James Cleverdon wrote:
> Drat! I thought I had all the logical vs. physical stuff straightened out.
> Could you give this patch a try? It dumps all kinds of APIC state info.
> You'll need to put a call to apic_state_dump() into check_timer() just after
> the TIMER: printk.
ID=0x02000000, LVR=0x00170011, TPR=0x00000000, ARB=0x00000002, PROCPRI=0x000000F0
DFR=0x0FFFFFFF, LDR=0x01000000, ICR=0x00088500, SPIV=0x000001FF, ICR=0x00088500,
ICR2=0x03000000, LVTT=0x00000000, LVTPC=0x00000000
LVT0=0x00010700, LVT1=0x00000400, LVTERR=0x000000FE
ISR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
TMR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
IRR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
clustered_apic_mode=0, esr_disable=0, target_cpus=0x00 apic_broadcast_id=0x0F
raw_phys_apicid[]= 00 01 02 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00 00 00 00 00 00
cpu_2_logical_apicid[]= 01 01 02 08 FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
FF FF FF FF FF FF FF FF FF FF FF FF FF
cpu_2_physical_apicid[]= 02 00 01 03 FF FF FF FF FF FF FF FF FF FF FF FF FF FF F
F FF FF FF FF FF FF FF FF FF FF FF FF FF
I/O APIC # 0:
00: 00000931:04000000 00010939:02000000 00010000:00000000 00000941:0F000000
04: 00000949:0F000000 00000951:0F000000 00010959:02000000 00000961:0F000000
08: 00000969:0F000000 00000971:0F000000 00000979:0F000000 00000981:0F000000
0C: 00000989:0F000000 00000991:0F000000 00000999:0F000000 000009A1:0F000000
10: 00010000:00000000 00010000:00000000 00010000:00000000 00010000:00000000
14: 00010000:00000000 00010000:00000000 00010000:00000000 00010000:00000000
> (Hmmm.... Must clean up this patch and submit it to kdb as two new commands,
> one for I/O APICs and one for local APICs....)
i'd vote for that =) except for one thing.
+ /* APICs tend to spasm when they get errors. Disable the error intr. */
+ apic_write_around(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
Isn't that a bit drastic?
Regards,
Zwane
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
@ 2002-07-24 17:28 Mikael Pettersson
2002-07-25 20:48 ` James Cleverdon
0 siblings, 1 reply; 17+ messages in thread
From: Mikael Pettersson @ 2002-07-24 17:28 UTC (permalink / raw)
To: jamesclv, zwane; +Cc: alan, linux-kernel
On Wed, 24 Jul 2002 17:26:49 +0200 (SAST), Zwane Mwaikambo wrote:
>i'd vote for that =) except for one thing.
>
>+ /* APICs tend to spasm when they get errors. Disable the error intr. */
>+ apic_write_around(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
>
>Isn't that a bit drastic?
Drastic is an understatement. Try "gross". Sane machines running correct
code shouldn't throw local APIC errors. If something's causing errors,
that something should be fixed, not hidden.
I hope that was just a temporary debug hack and not part of the design...
/Mikael
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-24 15:26 ` Zwane Mwaikambo
@ 2002-07-24 22:50 ` James Cleverdon
2002-07-25 3:34 ` James Cleverdon
1 sibling, 0 replies; 17+ messages in thread
From: James Cleverdon @ 2002-07-24 22:50 UTC (permalink / raw)
To: Zwane Mwaikambo; +Cc: Alan Cox, Linux Kernel
On Wednesday 24 July 2002 08:26 am, Zwane Mwaikambo wrote:
> On Tue, 23 Jul 2002, James Cleverdon wrote:
> > Drat! I thought I had all the logical vs. physical stuff straightened
> > out. Could you give this patch a try? It dumps all kinds of APIC state
> > info. You'll need to put a call to apic_state_dump() into check_timer()
> > just after the TIMER: printk.
>
> ID=0x02000000, LVR=0x00170011, TPR=0x00000000, ARB=0x00000002,
> PROCPRI=0x000000F0 DFR=0x0FFFFFFF, LDR=0x01000000, ICR=0x00088500,
> SPIV=0x000001FF, ICR=0x00088500, ICR2=0x03000000, LVTT=0x00000000,
> LVTPC=0x00000000
> LVT0=0x00010700, LVT1=0x00000400, LVTERR=0x000000FE
> ISR: 00000000 00000000 00000000 00000000 00000000 00000000 00000000
> 00000000 TMR: 00000000 00000000 00000000 00000000 00000000 00000000
> 00000000 00000000 IRR: 00000000 00000000 00000000 00000000 00000000
> 00000000 00000000 00000000 clustered_apic_mode=0, esr_disable=0,
> target_cpus=0x00 apic_broadcast_id=0x0F raw_phys_apicid[]= 00 01 02
> 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00 00 00 00
> cpu_2_logical_apicid[]= 01 01 02 08 FF FF FF FF FF FF FF FF FF FF FF FF FF
> FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
The logical numbers are bad. They should go 01 02 04 08... on a flat box.
(NUMA boxes are different.) I'll double check the code; am probably making
stupid assumptions, especially given the physical IDs below.
> cpu_2_physical_apicid[]= 02 00 01 03 FF FF FF FF FF FF FF FF FF FF FF FF FF
> FF F F FF FF FF FF FF FF FF FF FF FF FF FF FF
> I/O APIC # 0:
> 00: 00000931:04000000 00010939:02000000 00010000:00000000 00000941:0F000000
> 04: 00000949:0F000000 00000951:0F000000 00010959:02000000 00000961:0F000000
> 08: 00000969:0F000000 00000971:0F000000 00000979:0F000000 00000981:0F000000
> 0C: 00000989:0F000000 00000991:0F000000 00000999:0F000000 000009A1:0F000000
> 10: 00010000:00000000 00010000:00000000 00010000:00000000 00010000:00000000
> 14: 00010000:00000000 00010000:00000000 00010000:00000000 00010000:00000000
>
> > (Hmmm.... Must clean up this patch and submit it to kdb as two new
> > commands, one for I/O APICs and one for local APICs....)
>
> i'd vote for that =) except for one thing.
>
> + /* APICs tend to spasm when they get errors. Disable the error
> intr. */ + apic_write_around(APIC_LVTERR, ERROR_APIC_VECTOR |
> APIC_LVT_MASKED);
>
> Isn't that a bit drastic?
No. ;^)
When a local APIC weirds out and starts spewing interrupts as fast as it can
generate them, it can completely paralyze the system. I suspect that the
APIC error states aren't as well tested as I'd like. Anyway, turning off the
error interrupt is the only way to make enough forward progress to get a
clean shutdown. We had these problems with NUMA P6 boxes. Despite an APIC
bus analyzer pod on the logic analyzers, we never could find out what was
making them spasm or find any other halfway satisfactory solution, other than
to turn off the error interrupt.
> Regards,
> Zwane
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-24 15:26 ` Zwane Mwaikambo
2002-07-24 22:50 ` James Cleverdon
@ 2002-07-25 3:34 ` James Cleverdon
2002-07-25 7:11 ` Zwane Mwaikambo
2002-07-25 13:26 ` Zwane Mwaikambo
1 sibling, 2 replies; 17+ messages in thread
From: James Cleverdon @ 2002-07-25 3:34 UTC (permalink / raw)
To: Zwane Mwaikambo; +Cc: Alan Cox, Linux Kernel
[-- Attachment #1: Type: text/plain, Size: 920 bytes --]
On Wednesday 24 July 2002 08:26 am, Zwane Mwaikambo wrote:
[ Snip! ]
>raw_phys_apicid[]= 00 01 02 03 00 00 00 00 00 00 00 00 00 00 00 00 00
>00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> cpu_2_logical_apicid[]= 01 01 02 08 FF FF FF FF FF FF FF FF FF FF FF FF FF
> FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
> cpu_2_physical_apicid[]= 02 00 01 03 FF FF FF FF FF FF FF FF FF FF FF FF FF
> FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
Ah ha! Note that while the CPU records in the {MPS,ACPI/MADT} table are in
numerical order (as preserved in raw_phys_apicid), the boot CPU is # 02. The
flat code in smp_boot_cpus assumes that the boot CPU will be the first record
in the list. Oops.
Try the attached patch and see if it helps.
[ Snip! ]
>
> Regards,
> Zwane
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
[-- Attachment #2: 2.4.19-rc3-ac3_flat_hack.2002-07-24 --]
[-- Type: text/x-diff, Size: 715 bytes --]
--- ac3/arch/i386/kernel/smpboot.c.df Tue Jul 23 15:02:49 2002
+++ ac3/arch/i386/kernel/smpboot.c Wed Jul 24 18:02:24 2002
@@ -1077,6 +1077,7 @@
*/
Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
+ cpu = 1;
for (bit = 0; bit < NR_CPUS; bit++) {
if (!(phys_cpu_present_map & (1UL << bit)))
continue;
@@ -1093,7 +1094,7 @@
else if (clustered_apic_logical)
log_apicid = ((bit >> 2) << 4) | (1 << (bit & 0x3));
else
- log_apicid = 1u << bit;
+ log_apicid = 1u << cpu;
do_boot_cpu(phys_apicid, log_apicid);
@@ -1104,6 +1105,8 @@
(phys_cpu_present_map & (1ul << bit)))
printk("CPU #%d not responding - cannot use it.\n",
bit);
+ else
+ ++cpu;
}
/*
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-25 3:34 ` James Cleverdon
@ 2002-07-25 7:11 ` Zwane Mwaikambo
2002-07-25 20:29 ` James Cleverdon
2002-07-25 13:26 ` Zwane Mwaikambo
1 sibling, 1 reply; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-25 7:11 UTC (permalink / raw)
To: James Cleverdon; +Cc: Alan Cox, Linux Kernel
On Wed, 24 Jul 2002, James Cleverdon wrote:
> Ah ha! Note that while the CPU records in the {MPS,ACPI/MADT} table are in
> numerical order (as preserved in raw_phys_apicid), the boot CPU is # 02. The
> flat code in smp_boot_cpus assumes that the boot CPU will be the first record
> in the list. Oops.
Ok i'll give it a whirl, in that case how about the following code to do
the BSP check in another area too?
Index: linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c
===================================================================
RCS file: /home/zwane/source/cvs_rep/linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c,v
retrieving revision 1.2
diff -u -r1.2 smpboot.c
--- linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c 2002/07/25 06:06:56 1.2
+++ linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c 2002/07/25 06:15:05
@@ -46,6 +46,7 @@
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/smpboot.h>
+#include <asm/msr.h>
/* Set if we find a B stepping CPU */
static int smp_b_stepping;
@@ -229,6 +230,14 @@
return res;
}
+int smp_cpu_is_bsp (void)
+{
+ unsigned long l, h;
+
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ return (l & MSR_IA32_APICBASE_BSP);
+}
+
static void __init synchronize_tsc_bp (void)
{
int i;
@@ -1067,7 +1076,7 @@
connect_bsp_APIC();
setup_local_APIC();
- if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
+ if (!smp_cpu_is_bsp())
BUG();
/*
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-25 3:34 ` James Cleverdon
2002-07-25 7:11 ` Zwane Mwaikambo
@ 2002-07-25 13:26 ` Zwane Mwaikambo
1 sibling, 0 replies; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-25 13:26 UTC (permalink / raw)
To: James Cleverdon; +Cc: Alan Cox, Linux Kernel
On Wed, 24 Jul 2002, James Cleverdon wrote:
> Ah ha! Note that while the CPU records in the {MPS,ACPI/MADT} table are in
> numerical order (as preserved in raw_phys_apicid), the boot CPU is # 02. The
> flat code in smp_boot_cpus assumes that the boot CPU will be the first record
> in the list. Oops.
>
> Try the attached patch and see if it helps.
Ok that one goes all the way, but i don't think i've covered everything
(e.g. tested all the IPI functions). But otherwise looks good, i'll give
it a go on a bigger box later (tested on 4-way, i'll try 12)
Cheers,
Zwane
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-25 7:11 ` Zwane Mwaikambo
@ 2002-07-25 20:29 ` James Cleverdon
0 siblings, 0 replies; 17+ messages in thread
From: James Cleverdon @ 2002-07-25 20:29 UTC (permalink / raw)
To: Zwane Mwaikambo; +Cc: Alan Cox, Linux Kernel
On Thursday 25 July 2002 12:11 am, Zwane Mwaikambo wrote:
> On Wed, 24 Jul 2002, James Cleverdon wrote:
> > Ah ha! Note that while the CPU records in the {MPS,ACPI/MADT} table are
> > in numerical order (as preserved in raw_phys_apicid), the boot CPU is #
> > 02. The flat code in smp_boot_cpus assumes that the boot CPU will be the
> > first record in the list. Oops.
>
> Ok i'll give it a whirl, in that case how about the following code to do
> the BSP check in another area too?
That would work, too. The bug was not in recognizing the boot cpu, but in
assuming that we would continue (for one reason or another) the first time
around the loop, because logical ID 0x01 was already assigned.
The previous code got around this dependency in a weird and rather kludgey
way. Check -rc3 for the two instances of boot_cpu_logical_apicid in
mpparse.c and smpboot.c, and the two entirely different sources of
boot_cpu_logical_apicid's value. Bizarre.
> Index: linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c
> ===================================================================
> RCS file: /home/zwane/source/cvs_rep/linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c,v
> retrieving revision 1.2
> diff -u -r1.2 smpboot.c
> --- linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c 2002/07/25 06:06:56 1.2
> +++ linux-2.4.19-rc3-ac2/arch/i386/kernel/smpboot.c 2002/07/25 06:15:05
> @@ -46,6 +46,7 @@
> #include <asm/mtrr.h>
> #include <asm/pgalloc.h>
> #include <asm/smpboot.h>
> +#include <asm/msr.h>
>
> /* Set if we find a B stepping CPU */
> static int smp_b_stepping;
> @@ -229,6 +230,14 @@
> return res;
> }
>
> +int smp_cpu_is_bsp (void)
> +{
> + unsigned long l, h;
> +
> + rdmsr(MSR_IA32_APICBASE, l, h);
> + return (l & MSR_IA32_APICBASE_BSP);
> +}
> +
> static void __init synchronize_tsc_bp (void)
> {
> int i;
> @@ -1067,7 +1076,7 @@
> connect_bsp_APIC();
> setup_local_APIC();
>
> - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
> + if (!smp_cpu_is_bsp())
> BUG();
>
> /*
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-24 17:28 2.4.19-rc3-ac2 SMP Mikael Pettersson
@ 2002-07-25 20:48 ` James Cleverdon
2002-07-26 10:31 ` Zwane Mwaikambo
0 siblings, 1 reply; 17+ messages in thread
From: James Cleverdon @ 2002-07-25 20:48 UTC (permalink / raw)
To: Mikael Pettersson, zwane; +Cc: alan, linux-kernel
On Wednesday 24 July 2002 10:28 am, Mikael Pettersson wrote:
> On Wed, 24 Jul 2002 17:26:49 +0200 (SAST), Zwane Mwaikambo wrote:
> >i'd vote for that =) except for one thing.
> >
> >+ /* APICs tend to spasm when they get errors. Disable the error intr. */
> >+ apic_write_around(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
> >
> >Isn't that a bit drastic?
>
> Drastic is an understatement. Try "gross". Sane machines running correct
> code shouldn't throw local APIC errors. If something's causing errors,
> that something should be fixed, not hidden.
>
> I hope that was just a temporary debug hack and not part of the design...
>
> /Mikael
On the contrary, when Intel moved the local APIC from a separate chip onto the
CPU around the time of the P54C, they hobbled it. Formerly, it could accept
and latch any number of interrupts because it contained three bit vectors
that could store all the necessary state info. The P54C (and later) version
had two latches per interrupt level. The level was defined as the top nibble
of the interrupt vector. So, P54Cs could only latch two interrupts for, say,
the 0x31-0x3F range for ISA IRQs. Too bad if three 0x3X interrupts arrive.
Number 3 cannot be latched.
Intel added new error states to the local APIC and the bus protocol to allow
for interrupts to _not_ be delivered, thanks to the latch limit. On a busy
system with lots of interrupts, you will sometimes see several of these
receive accept errors per day. There is nothing you can do to fix the
condition, aside from processing all pending interrupts promptly. It really
is more of a warning than an error.
On our NUMA P6 box, we found that the local APICs would occasionally start
spasming with error interrupts. An APIC bus analyzer didn't show any kind of
errors on the APIC bus. They would just weird out and all attempts to clear
the error had no effect. We never did find a solution to that one or get an
adequate explanation from Intel. The only kludge that worked was to turn off
the APIC error interrupt.
Naturally, the cleaned up version of the apic_state_dump patch wouldn't do
that, or would make it an option.
--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: 2.4.19-rc3-ac2 SMP
2002-07-25 20:48 ` James Cleverdon
@ 2002-07-26 10:31 ` Zwane Mwaikambo
0 siblings, 0 replies; 17+ messages in thread
From: Zwane Mwaikambo @ 2002-07-26 10:31 UTC (permalink / raw)
To: James Cleverdon; +Cc: Mikael Pettersson, alan, linux-kernel
Hi James,
On Thu, 25 Jul 2002, James Cleverdon wrote:
> On our NUMA P6 box, we found that the local APICs would occasionally start
> spasming with error interrupts. An APIC bus analyzer didn't show any kind of
> errors on the APIC bus. They would just weird out and all attempts to clear
> the error had no effect. We never did find a solution to that one or get an
> adequate explanation from Intel. The only kludge that worked was to turn off
> the APIC error interrupt.
>
> Naturally, the cleaned up version of the apic_state_dump patch wouldn't do
> that, or would make it an option.
Since you have the bus analyzer, how frequently (if at all) have you seen
the EOI register being written to without any bit set in the ISR?
Thanks,
Zwane
--
function.linuxpower.ca
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2002-07-26 10:10 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2002-07-23 4:21 Summit patch for 2.4.19-rc3-ac2 James Cleverdon
2002-07-23 8:51 ` Lech Szychowski
2002-07-23 12:03 ` 2.4.19-rc3-ac2 SMP Zwane Mwaikambo
2002-07-23 12:11 ` Zwane Mwaikambo
2002-07-23 18:50 ` James Cleverdon
2002-07-24 15:26 ` Zwane Mwaikambo
2002-07-24 22:50 ` James Cleverdon
2002-07-25 3:34 ` James Cleverdon
2002-07-25 7:11 ` Zwane Mwaikambo
2002-07-25 20:29 ` James Cleverdon
2002-07-25 13:26 ` Zwane Mwaikambo
2002-07-23 13:30 ` Summit patch for 2.4.19-rc3-ac2 James Bourne
2002-07-23 13:42 ` Steven Cole
2002-07-23 14:34 ` Philippe Gramoullé
-- strict thread matches above, loose matches on Subject: below --
2002-07-24 17:28 2.4.19-rc3-ac2 SMP Mikael Pettersson
2002-07-25 20:48 ` James Cleverdon
2002-07-26 10:31 ` Zwane Mwaikambo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox