* [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
@ 2007-05-17 23:03 Siddha, Suresh B
2007-05-17 23:58 ` Eric W. Biederman
` (3 more replies)
0 siblings, 4 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-17 23:03 UTC (permalink / raw)
To: mingo, ebiederm, ak
Cc: akpm, linux-kernel, nanhai.zou, asit.k.mallick, keith.packard
On x86_64 kernels, level-triggered irq migration is initiated in the context
of that interrupt (after executing the irq handler), and the following steps
are taken to migrate the irq (a C sketch follows the list):
1. mask IOAPIC RTE entry; // write to IOAPIC RTE
2. EOI; // processor EOI write
3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination
                              // and interrupt vector due to per-cpu vector
                              // allocation.
4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
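For concreteness, a minimal C sketch of the sequence above, written with the
helper names used in arch/x86_64/kernel/io_apic.c and kernel/irq of that era;
this is illustrative only, not the literal kernel code path:

static void migrate_level_irq_sketch(unsigned int irq)
{
	mask_IO_APIC_irq(irq);		/* 1. mask RTE      - IOAPIC write     */
	ack_APIC_irq();			/* 2. EOI           - local APIC write */
	move_masked_irq(irq);		/* 3. reprogram RTE - new dest/vector  */
	unmask_IO_APIC_irq(irq);	/* 4. unmask RTE    - IOAPIC write     */
}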
Because of the per-cpu vector allocation in x86_64 kernels, when the irq
migrates to a different cpu, a new vector (corresponding to the new cpu)
gets allocated.
An EOI write to the local APIC has the side effect of generating an EOI
message for level-triggered interrupts (normally a broadcast to all IOAPICs).
The EOI broadcast generated as a side effect of the EOI write to the processor
may be delayed while the other IOAPIC writes (steps 3 and 4) go through.
Normally, the EOI generated by the local APIC for a level-triggered interrupt
contains the vector number. The IOAPIC takes this vector number, searches
its RTE entries for an entry with a matching vector number, and clears the
remote IRR bit (indicating EOI). However, if the vector number has been
changed (as in step 3), the IOAPIC will not find the RTE entry when the EOI
arrives later. This causes the remote IRR to get stuck, hanging the
interrupt (no more interrupts from this RTE).
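As a conceptual model of the EOI matching just described (hardware behavior
sketched in kernel-style C purely for illustration; struct ioapic_model and
its fields are hypothetical, only struct IO_APIC_route_entry mirrors the real
RTE layout from <asm/io_apic.h>):

struct ioapic_model {
	int nr_pins;
	struct IO_APIC_route_entry rte[24];	/* assumed pin count */
};

/* What the IOAPIC conceptually does when an EOI broadcast arrives. */
static void ioapic_model_receive_eoi(struct ioapic_model *io, u8 eoi_vector)
{
	int pin;

	for (pin = 0; pin < io->nr_pins; pin++) {
		struct IO_APIC_route_entry *rte = &io->rte[pin];

		/* Only level-triggered entries with a matching vector have
		 * their remote IRR cleared. */
		if (rte->trigger && rte->vector == eoi_vector)
			rte->irr = 0;
	}
	/* If the RTE's vector was reprogrammed before this EOI arrived,
	 * nothing matches and remote IRR stays set: the irq hangs. */
}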
The current x86_64 kernel assumes that the remote IRR bit is cleared by the
time the IOAPIC RTE is reprogrammed. Fix this assumption by checking the
remote IRR bit and, if it is still set, delaying the irq migration to the
next interrupt arrival event (hopefully, next time the remote IRR bit will
be cleared before the IOAPIC RTE is reprogrammed).
Initial analysis and patch from Nanhai.
Clean up patch from Suresh.
Signed-off-by: Nanhai Zou <nanhai.zou@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Asit Mallick <asit.k.mallick@intel.com>
Cc: Keith Packard <keith.packard@intel.com>
---
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 2a2df14..632ec57 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -163,7 +163,7 @@ static inline void io_apic_sync(unsigned int apic)
readl(&io_apic->data);
}
-#define __DO_ACTION(R, ACTION, FINAL) \
+#define __DO_ACTION(R, ACTION, FINAL, CHECKREG, VAL) \
\
{ \
int pin; \
@@ -177,12 +177,18 @@ static inline void io_apic_sync(unsigned int apic)
break; \
reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
reg ACTION; \
- io_apic_modify(entry->apic, reg); \
- FINAL; \
+ if (CHECKREG) { \
+ if (reg == VAL) \
+ return 1; \
+ } else { \
+ io_apic_modify(entry->apic, reg); \
+ FINAL; \
+ } \
if (!entry->next) \
break; \
entry = irq_2_pin + entry->next; \
} \
+ return 0; \
}
union entry_union {
@@ -241,6 +247,13 @@ static void ioapic_mask_entry(int apic, int pin)
}
#ifdef CONFIG_SMP
+/*
+ * If its Level triggered and RemoteIRR is still set?
+ */
+static int pending_eoi(unsigned int irq)
+{
+ __DO_ACTION(0, &= PENDING_EOI, , 1, PENDING_EOI);
+}
static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
{
int apic, pin;
@@ -271,6 +284,16 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
unsigned int dest;
cpumask_t tmp;
+ /*
+ * If the EOI still didn't reach the RTE corresponding to the
+ * level triggered irq, postpone the irq migration to the next
+ * irq arrival event.
+ */
+ if (pending_eoi(irq)) {
+ irq_desc[irq].status |= IRQ_MOVE_PENDING;
+ return;
+ }
+
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
return;
@@ -320,8 +343,8 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
#define DO_ACTION(name,R,ACTION, FINAL) \
\
- static void name##_IO_APIC_irq (unsigned int irq) \
- __DO_ACTION(R, ACTION, FINAL)
+ static int name##_IO_APIC_irq (unsigned int irq) \
+ __DO_ACTION(R, ACTION, FINAL, 0, 0)
DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
/* mask = 1 */
diff --git a/include/asm-x86_64/io_apic.h b/include/asm-x86_64/io_apic.h
index 969d225..94b9a74 100644
--- a/include/asm-x86_64/io_apic.h
+++ b/include/asm-x86_64/io_apic.h
@@ -90,6 +90,8 @@ struct IO_APIC_route_entry {
dest : 8;
} __attribute__ ((packed));
+#define PENDING_EOI (1 << 14 | 1 << 15)
+
/*
* MP-BIOS irq configuration table structures:
*/
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc..acc56aa 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -55,7 +55,6 @@ void move_masked_irq(int irq)
if (likely(!cpus_empty(tmp))) {
desc->chip->set_affinity(irq,tmp);
}
- cpus_clear(irq_desc[irq].pending_mask);
}
void move_native_irq(int irq)
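For readers decoding the macro trick above: PENDING_EOI covers bits 14 and 15
of the RTE (remote IRR and level trigger mode), and pending_eoi() returns 1
only when both are set on some pin of the irq. A de-macro'd sketch of what it
effectively computes, assuming the irq_2_pin chaining that __DO_ACTION walks
(ioapic_lock handling elided):

static int pending_eoi_sketch(unsigned int irq)
{
	struct irq_pin_list *entry = irq_2_pin + irq;

	for (;;) {
		unsigned int reg;

		if (entry->pin == -1)
			break;
		reg = io_apic_read(entry->apic, 0x10 + entry->pin * 2);
		/* level trigger mode (bit 15) and remote IRR (bit 14) */
		if ((reg & PENDING_EOI) == PENDING_EOI)
			return 1;
		if (!entry->next)
			break;
		entry = irq_2_pin + entry->next;
	}
	return 0;
}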
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-17 23:03 [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq Siddha, Suresh B
@ 2007-05-17 23:58 ` Eric W. Biederman
2007-05-18 0:43 ` Siddha, Suresh B
2007-05-18 0:30 ` Eric W. Biederman
` (2 subsequent siblings)
3 siblings, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-17 23:58 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: mingo, ak, akpm, linux-kernel, nanhai.zou, asit.k.mallick,
keith.packard
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> on x86_64 kernel, level triggered irq migration gets initiated in the context
> of that interrupt(after executing the irq handler) and following steps are
> followed to do the irq migration.
>
> 1. mask IOAPIC RTE entry; // write to IOAPIC RTE
> 2. EOI; // processor EOI write
> 3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination and
> // and interrupt vector due to per cpu vector
> // allocation.
> 4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
>
> Because of the per cpu vector allocation in x86_64 kernels, when the irq
> migrates to a different cpu, new vector(corresponding to the new cpu) will
> get allocated.
>
> An EOI write to local APIC has a side effect of generating an EOI write
> for level trigger interrupts (normally this is a broadcast to all IOAPICs).
> The EOI broadcast generated as a side effect of EOI write to processor may
> be delayed while the other IOAPIC writes (step 3 and 4) can go through.
>
> Normally, the EOI generated by local APIC for level trigger interrupt
> contains vector number. The IOAPIC will take this vector number and
> search the IOAPIC RTE entries for an entry with matching vector number and
> clear the remote IRR bit (indicate EOI). However, if the vector number is
> changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
> is received later. This will cause the remote IRR to get stuck causing the
> interrupt hang (no more interrupt from this RTE).
>
> Current x86_64 kernel assumes that remote IRR bit is cleared by the time
> IOAPIC RTE is reprogrammed. Fix this assumption by checking for remote IRR
> bit and if it still set, delay the irq migration to the next interrupt
> arrival event(hopefully, next time remote IRR bit will get cleared
> before the IOAPIC RTE is reprogrammed).
>
> Initial analysis and patch from Nanhai.
In essence this makes sense, and it may be the best work around for
buggy hardware available. However I am not convinced that the remote
IRR on ioapics works reliably enough to be used for anything. I
tested this earlier and I could not successfully poll the remote irr
bit to see if an ioapic had received an irq acknowledgement. Instead
I locked up irq controllers.
If remote IRR worked reliably I could have pulled the irq migration
out of irq context. So this fix looks dubious to me.
Why is the EOI delayed? Can we work around that?
It would be nice if ioapics and local apics actually obeyed the pci
ordering rules where a read would flush all traffic. And we do have a
read in there.
I'm assuming the symptom you are seeing on current kernels is that occasionally
the irq gets stuck and never fires again?
I'm not certain I like the patch either, but I need to look more closely.
You are mixing changes to generic and arch specific code together.
I think pending_eoi should not try to reuse __DO_ACTION as that
helper bit of code does not seem appropriate.
It would probably be best if the pending_eoi check was in
ack_apic_level with the rest of the weird logic we are working around.
Could we please have more detail on this hardware behavior? Why is
the EOI write delayed? Why can't we just issue reads in
appropriate places to flush the write?
I need to think about this some more to see if there are any other
possible ways we could address this issue that would be more robust.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-17 23:03 [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq Siddha, Suresh B
2007-05-17 23:58 ` Eric W. Biederman
@ 2007-05-18 0:30 ` Eric W. Biederman
2007-05-18 1:01 ` Siddha, Suresh B
2007-05-18 12:00 ` Andi Kleen
2007-05-31 11:34 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2) Eric W. Biederman
3 siblings, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 0:30 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: mingo, ak, akpm, linux-kernel, nanhai.zou, asit.k.mallick,
keith.packard
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> on x86_64 kernel, level triggered irq migration gets initiated in the context
> of that interrupt(after executing the irq handler) and following steps are
> followed to do the irq migration.
>
> 1. mask IOAPIC RTE entry; // write to IOAPIC RTE
> 2. EOI; // processor EOI write
> 3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination and
> // and interrupt vector due to per cpu vector
> // allocation.
> 4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
>
> Because of the per cpu vector allocation in x86_64 kernels, when the irq
> migrates to a different cpu, new vector(corresponding to the new cpu) will
> get allocated.
>
> An EOI write to local APIC has a side effect of generating an EOI write
> for level trigger interrupts (normally this is a broadcast to all IOAPICs).
> The EOI broadcast generated as a side effect of EOI write to processor may
> be delayed while the other IOAPIC writes (step 3 and 4) can go through.
>
> Normally, the EOI generated by local APIC for level trigger interrupt
> contains vector number. The IOAPIC will take this vector number and
> search the IOAPIC RTE entries for an entry with matching vector number and
> clear the remote IRR bit (indicate EOI). However, if the vector number is
> changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
> is received later. This will cause the remote IRR to get stuck causing the
> interrupt hang (no more interrupt from this RTE).
>
> Current x86_64 kernel assumes that remote IRR bit is cleared by the time
> IOAPIC RTE is reprogrammed. Fix this assumption by checking for remote IRR
> bit and if it still set, delay the irq migration to the next interrupt
> arrival event(hopefully, next time remote IRR bit will get cleared
> before the IOAPIC RTE is reprogrammed).
>
> Initial analysis and patch from Nanhai.
So why does any of this matter?
My memory says that the ioapic state for sending irqs gets reset when we
unmask the irq.
If not I expect we can use the mask and edge, unmask and level
work-around from arch/i386/ioapic.c. That looks like it would
be less code, less error prone and easier to implement than the
current work-around.
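A rough sketch of that i386 work-around, assuming the
__mask_and_edge_IO_APIC_irq()/__unmask_and_level_IO_APIC_irq() helpers
generated in arch/i386/kernel/io_apic.c of that era (illustrative only):

static void clear_remote_irr_sketch(unsigned int irq)
{
	unsigned long flags;

	spin_lock_irqsave(&ioapic_lock, flags);
	/* Mask the entry and switch it to edge triggered ... */
	__mask_and_edge_IO_APIC_irq(irq);
	/* ... then switch back to level and unmask; on ioapics that clear
	 * remote IRR across the mode change this unsticks the entry. */
	__unmask_and_level_IO_APIC_irq(irq);
	spin_unlock_irqrestore(&ioapic_lock, flags);
}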
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-17 23:58 ` Eric W. Biederman
@ 2007-05-18 0:43 ` Siddha, Suresh B
0 siblings, 0 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-18 0:43 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
On Thu, May 17, 2007 at 04:58:02PM -0700, Eric W. Biederman wrote:
> In essence this makes sense, and it may be the best work around for
> buggy hardware available. However I am not convinced that the remote
> IRR on ioapics works reliably enough to be used for anything. I
> tested this earlier and I could not successfully poll the remote irr
> bit to see if an ioapic had received an irq acknowledgement. Instead
> I locked up irq controllers.
>
> If remote IRR worked reliably I could have pulled the irq migration
> out of irq context. So this fix looks dubious to me.
We are just reading IOAPIC RTEs, so I am not sure why this can lead to
lock-ups. This patch is not reading any new HW registers. It just
checks for certain bit values from the register that the code reads
anyhow.
Remote IRR should work reliably, otherwise we would see a lot more issues,
I guess.
>
> Why is the EOI delayed? Can we work around that?
Under some stress conditions the EOI can get retried, which is why
it is getting delayed.
>
> It would be nice if ioapics and local apics actually obeyed the pci
> ordering rules where a read would flush all traffic. And we do have a
> read in there.
>
> I'm assuming the symptom you are seeing on current kernels is that
> occasionally
> the irq gets stuck and never fires again?
yes.
> I'm not certain I like the patch either, but I need to look more closely.
> You are mixing changes to generic and arch specific code together.
>
> I think pending_eoi should not try to reuse __DO_ACTION as that
> helper bit of code does not seem appropriate.
Wanted to minimize code duplication and hence overloaded the existing
macros.
>
> It would probably be best if the pending_eoi check was in
> ack_apic_level with the rest of the weird logic we are working around.
Just wanted to make sure that we give some time for the EOI to actually
reach the IO-APIC.
>
> Could we please have more detail on this hardware behavior. Why is
> the EIO write write delayed? Why can't we just issue reads in
> appropriate places to flush the write?
Because the EOI to IOAPIC gets generated as a side effect of processor
EOI.
thanks,
suresh
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 0:30 ` Eric W. Biederman
@ 2007-05-18 1:01 ` Siddha, Suresh B
2007-05-18 14:40 ` Eric W. Biederman
0 siblings, 1 reply; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-18 1:01 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
On Thu, May 17, 2007 at 05:30:13PM -0700, Eric W. Biederman wrote:
> So why does any of this matter?
>
> My memory says that the ioapic state for sending irqs gets reset when we
> unmask the irq.
No. At least not on the platform we have tested.
> If not I expect we can use the mask and edge, unmask and level
> work-around from arch/i386/ioapic.c. That looks like it would
> be less code, less error prone and easier to implement then the
> current work around.
hmm. We have to check and see if it can help. But ioapic spec doesn't
say that the Remote IRR gets reset when the trigger mode changes. Not sure
if we can apply the logic widely.
thanks,
suresh
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-17 23:03 [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq Siddha, Suresh B
2007-05-17 23:58 ` Eric W. Biederman
2007-05-18 0:30 ` Eric W. Biederman
@ 2007-05-18 12:00 ` Andi Kleen
2007-05-18 14:19 ` Eric W. Biederman
2007-05-31 11:34 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2) Eric W. Biederman
3 siblings, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2007-05-18 12:00 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: mingo, ebiederm, akpm, linux-kernel, nanhai.zou, asit.k.mallick,
keith.packard
On Friday 18 May 2007 01:03, Siddha, Suresh B wrote:
> Normally, the EOI generated by local APIC for level trigger interrupt
> contains vector number. The IOAPIC will take this vector number and
> search the IOAPIC RTE entries for an entry with matching vector number and
> clear the remote IRR bit (indicate EOI). However, if the vector number is
> changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
> is received later. This will cause the remote IRR to get stuck causing the
> interrupt hang (no more interrupt from this RTE).
Does this happen often or did you only see it in some extreme or obscure
case?
> + /*
> + * If the EOI still didn't reach the RTE corresponding to the
> + * level triggered irq, postpone the irq migration to the next
> + * irq arrival event.
> + */
> + if (pending_eoi(irq)) {
> + irq_desc[irq].status |= IRQ_MOVE_PENDING;
> + return;
Other code seems to have similar problems, but we don't have any lock
protecting that bitmap against parallel updates outside the irq itself, do
we? Perhaps it needs to be all set_bit().
-Andi
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 12:00 ` Andi Kleen
@ 2007-05-18 14:19 ` Eric W. Biederman
0 siblings, 0 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 14:19 UTC (permalink / raw)
To: Andi Kleen
Cc: Siddha, Suresh B, mingo, akpm, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard
Andi Kleen <ak@suse.de> writes:
> On Friday 18 May 2007 01:03, Siddha, Suresh B wrote:
>
>> Normally, the EOI generated by local APIC for level trigger interrupt
>> contains vector number. The IOAPIC will take this vector number and
>> search the IOAPIC RTE entries for an entry with matching vector number and
>> clear the remote IRR bit (indicate EOI). However, if the vector number is
>> changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
>> is received later. This will cause the remote IRR to get stuck causing the
>> interrupt hang (no more interrupt from this RTE).
>
> Does this happen often or did you only see it in some extreme or obscure
> case?
It is obscure. Someone may have a case that makes it easy to
reproduce but it won't happen frequently.
So we should have enough time to review and fix this carefully,
instead of needing to rush a fix out.
>> + /*
>> + * If the EOI still didn't reach the RTE corresponding to the
>> + * level triggered irq, postpone the irq migration to the next
>> + * irq arrival event.
>> + */
>> + if (pending_eoi(irq)) {
>> + irq_desc[irq].status |= IRQ_MOVE_PENDING;
>> + return;
>
> Other code seems to have similar problems, but we don't have any lock
> protecting that bitmap against parallel updates outside the irq itself, don't
> we? Perhaps it needs to be all set_bit()
Huh? The irq lock is sufficient for this.
This is not a software code problem; this is a software-hardware
interaction problem. The problem is that there is no way to
guarantee that ioapics see a series of manipulations in the same
order that the cpu issues those manipulations.
In particular the ioapic is seeing the ack after we reprogram
the ioapic, which causes the ioapic to not accept the ack (because
its vector number changed), and then the ioapic waits forever
for an ack that isn't coming.
The other half of the problem is that pending_eoi() is testing a bit in
the hardware that in my experience is not sufficient to guarantee that
the hardware is in a state where action may be taken.
This is a very delicate problem because there is little if any guaranteed
ordering between cpu operations and ioapic operations, especially because
there are two channels of communication: the ``apic bus'' and the memory
mapped ioapic registers. So what happens on one channel has no ordering
requirements of any kind with respect to what happens on the other channel.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 1:01 ` Siddha, Suresh B
@ 2007-05-18 14:40 ` Eric W. Biederman
2007-05-18 17:30 ` Yinghai Lu
2007-05-18 18:07 ` Siddha, Suresh B
0 siblings, 2 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 14:40 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: mingo, ak, akpm, linux-kernel, Zou, Nanhai, Mallick, Asit K,
Packard, Keith
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> On Thu, May 17, 2007 at 05:30:13PM -0700, Eric W. Biederman wrote:
>> So why does any of this matter?
>>
>> My memory says that the ioapic state for sending irqs gets reset when we
>> unmask the irq.
>
> No. Atleast not on the platform we have tested.
Ok. It must simply have the test for an external interrupt getting reset,
not the delivery state machine. Otherwise you would not have a
problem in the first place.
>> If not I expect we can use the mask and edge, unmask and level
>> work-around from arch/i386/ioapic.c. That looks like it would
>> be less code, less error prone and easier to implement then the
>> current work around.
>
> hmm. We have to check and see if it can help. But ioapic spec doesn't
> say that the Remote IRR gets reset when the trigger mode changes. Not sure
> if we can apply the logic widely.
Worst case it is a noop, so wide deployment should be safe.
The original definition of the Remote IRR is:
> 14 Remote IRR--RO. This bit is used for level triggered
> interrupts. Its meaning is undefined for edge triggered
> interrupts. For level triggered interrupts, this bit is set to 1
> when local APIC(s) accept the level interrupt sent by the
> IOAPIC. The Remote IRR bit is set to 0 when an EOI message with a
> matching interrupt vector is received from a local APIC.
We know it is not used to throttle edge triggered interrupts.
The question is how does the hardware implementation make that come
about? By ignoring Remote IRR when transmitting edge triggered
interrupts? By insisting Remote IRR is clear when doing a mode
transition? By clearing Remote IRR when doing a mode transition?
At least historically there is some evidence that Intel's ioapics
did the latter. I don't know if there is a de facto standard
for how ioapic mode transitions happen.
Still in any of those I don't see a problem with switching to edge
triggered mode and then back again. Either Remote IRR will keep
its current state or it will be cleared. Remote IRR should not
get set (when it was clear) by such a procedure because that
would mess up the normal interrupt enable sequence that happens
on boot. So I'm pretty certain toggling the edge bit is harmless
and it may actually clear Remote IRR for us.
This does seem to be a generally applicable problem (at least in theory),
because we have no useful ordering rules we can take advantage of,
so something that works across the board does make sense.
With a little care I think we can safely send an ack to the new
vector just in case this race triggers. Before we send an
ipi to ourselves we need to ensure in the vector allocator that
the vector is either just for level or just for edge triggered
interrupts. Then we can just send an ipi to our current cpu,
ack it, and ensure the interrupt is acked.
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> On Thu, May 17, 2007 at 04:58:02PM -0700, Eric W. Biederman wrote:
>> In essence this makes sense, and it may be the best work around for
>> buggy hardware available. However I am not convinced that the remote
>> IRR on ioapics works reliably enough to be used for anything. I
>> tested this earlier and I could not successfully poll the remote irr
>> bit to see if an ioapic had received an irq acknowledgement. Instead
>> I locked up irq controllers.
>>
>> If remote IRR worked reliably I could have pulled the irq migration
>> out of irq context. So this fix looks dubious to me.
>
> We are just reading IOAPIC RTE's. So not sure why this can lead to lock
> up's. This patch is not reading any new HW registers. It just
> checks for certain bit values from the register that the code reads
> anyhow.
>
> Remote IRR should work reliably otherwise we will see lot more issues,
> I guess.
The test I performed was outside of interrupt context: mask an
interrupt, poll Remote IRR until it was clear, and then migrate
the interrupt. That does not work correctly. If that could
be made to work correctly we would not need to do irq migration
in interrupt context.
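For reference, the failed out-of-irq-context experiment described here would
look roughly like the sketch below (illustrative only; the Remote IRR poll
reuses the io_apic_level_ack_pending() helper that appears later in this
thread, and in Eric's testing this approach locked up irq controllers):

static void migrate_irq_outside_irq_context_sketch(unsigned int irq,
						   cpumask_t mask)
{
	mask_IO_APIC_irq(irq);

	/* Poll Remote IRR until the ioapic claims the ack has arrived. */
	while (io_apic_level_ack_pending(irq))
		cpu_relax();

	set_ioapic_affinity_irq(irq, mask);	/* reprogram dest + vector */
	unmask_IO_APIC_irq(irq);
}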
So while that is not conclusive evidence that Remote IRR is not a
reliable indication of when it is safe to do things, it is a strong
suggestion that it is not a good indicator. Currently we don't
look at Remote IRR so we have no evidence that it is good for anything.
>> Why is the EOI delayed? Can we work around that?
>
> Under some stress conditions, EOI can get retried because of which
> it is getting delayed.
Ok. So that is why EOI is delayed.
>> It would be nice if ioapics and local apics actually obeyed the pci
>> ordering rules where a read would flush all traffic. And we do have a
>> read in there.
>>
>> I'm assuming the symptom you are seeing on current kernels is that
>> occasionally
>> the irq gets stuck and never fires again?
>
> yes.
>
>> I'm not certain I like the patch either, but I need to look more closely.
>> You are mixing changes to generic and arch specific code together.
>>
>> I think pending_eoi should not try to reuse __DO_ACTION as that
>> helper bit of code does not seem appropriate.
>
> Wanted to minimize the code duplicacy and hence overloaded existing
> macros.
I think going more in the direction that this code has gone on arch/i386,
with real functions, is preferable.
>> It would probably be best if the pending_eoi check was in
>> ack_apic_level with the rest of the weird logic we are working around.
>
> Just wanted to make sure that we give some time for the EOI to actually
> reach the IO-APIC.
If that is what we want, an explicit delay is likely better than an
implicit wait induced by a function call. A couple of extra
instructions aren't likely to make a really noticeable difference
to the slow interrupt controllers, and it makes the code much
less readable.
>> Could we please have more detail on this hardware behavior. Why is
>> the EIO write write delayed? Why can't we just issue reads in
>> appropriate places to flush the write?
>
> Because the EOI to IOAPIC gets generated as a side effect of processor
> EOI.
I agree with that. But it would be useful if someone defined a
sequence such that we could read from the local apic and then from
the ioapics to ensure we flush all traffic. Not being able to
synchronize with an ioapic generates no end of challenges.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 14:40 ` Eric W. Biederman
@ 2007-05-18 17:30 ` Yinghai Lu
2007-05-18 18:01 ` Eric W. Biederman
2007-05-18 18:07 ` Siddha, Suresh B
1 sibling, 1 reply; 26+ messages in thread
From: Yinghai Lu @ 2007-05-18 17:30 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
Eric,
The number of ioapic level-triggered irqs is limited. So if we keep the
vector number unchanged when migrating to other cpus, that could help.
It will need a few modifications to assign_irq_vector and
irq_complete_move/smp_irq_move_cleanup_interrupt, because they assume
the vector must be changed.
YH
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 17:30 ` Yinghai Lu
@ 2007-05-18 18:01 ` Eric W. Biederman
2007-05-18 18:09 ` Yinghai Lu
0 siblings, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 18:01 UTC (permalink / raw)
To: Yinghai Lu
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
"Yinghai Lu" <yhlu.kernel@gmail.com> writes:
> Eric,
>
> ioapic_level irq is limited, So if we keep vector number not changed
> when imgration to other cpus. It that could help.
We can solve the problem without doing that, and keeping the same
vector number during migration keeps x86 from scaling. Personally
I would prefer to disallow irq migration.
> it will need modify a little with assign_irq_vector and
> irq_complete_move/smp_irq_move_cleanup_interrupt. because it assume
> vector must be changed.
Yes it does.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 14:40 ` Eric W. Biederman
2007-05-18 17:30 ` Yinghai Lu
@ 2007-05-18 18:07 ` Siddha, Suresh B
2007-05-18 18:28 ` Yinghai Lu
2007-05-31 2:50 ` Eric W. Biederman
1 sibling, 2 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-18 18:07 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
Eric,
On Fri, May 18, 2007 at 07:40:53AM -0700, Eric W. Biederman wrote:
> Still in any of those I don't see a problem with switching to edge
> triggered mode and then back again. Either Remote IRR will keep
> it's current state or it will be cleared. Remote IRR should not
> get set (when it was clear) by such a procedure because that
> would mess up the normal interrupt enable sequence that happens
> on boot. So I'm pretty certain toggling the edge bit is harmless
> and it may actually clear Remote IRR for us.
...
>
> I think going more the way that this code has gone on arch/i386 with
> real functions is preferable.
There is an issue with this suggestion. We have an inflight
EOI broadcast msg to this IOAPIC (delayed but still alive), and
that can cause a problem if we use edge and back to level to reset the
remote IRR bit.
If the vector number stays the same during irq migration and we reset the
remote IRR bit using the above method (edge and then back to level) during
irq migration, then we have a problem. A new interrupt arriving on the new
cpu will set the remote IRR bit, and then the old inflight EOI broadcast
reaches the IOAPIC RTE (resetting the remote IRR bit, because the vector in
the broadcast msg is the same), while the kernel code still assumes that the
remote IRR bit is set. This will lead to more problems and issues.
thanks,
suresh
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:01 ` Eric W. Biederman
@ 2007-05-18 18:09 ` Yinghai Lu
2007-05-18 18:20 ` Eric W. Biederman
0 siblings, 1 reply; 26+ messages in thread
From: Yinghai Lu @ 2007-05-18 18:09 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
On 5/18/07, Eric W. Biederman <ebiederm@xmission.com> wrote:
> We can solve the problem without doing that, and keeping the same
> vector number during migration keeps x86 from scaling.
I mean ioapic level irqs could be limited. New devices could use MSI or
HT irqs directly, with fewer irq routing problems.
> Personally I would prefer to disallow irq migration.
? typo?
For amd platforms with different hypertransport chains on different
nodes, irq migration is needed.
YH
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:09 ` Yinghai Lu
@ 2007-05-18 18:20 ` Eric W. Biederman
0 siblings, 0 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 18:20 UTC (permalink / raw)
To: Yinghai Lu
Cc: Siddha, Suresh B, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
"Yinghai Lu" <yhlu.kernel@gmail.com> writes:
> On 5/18/07, Eric W. Biederman <ebiederm@xmission.com> wrote:
>> We can solve the problem without doing that, and keeping the same
>> vector number during migration keeps x86 from scaling.
>
> I mean ioapic level irq couls be limited. new device could use MSI or
> HT irq directly and less irq routing problem.
Possibly. It really doesn't buy us anything until most irqs are MSI
which they are not yet.
>> Personally I would prefer to disallow irq migration.
> ? typo?
> For amd platform with different hypertransport chain on different
> nodes, irq migration is needed.
irqs not on cpu0 are needed. irq migration is less necessary, and I periodically
think we are insane for supporting it.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:07 ` Siddha, Suresh B
@ 2007-05-18 18:28 ` Yinghai Lu
2007-05-18 18:39 ` Siddha, Suresh B
2007-05-31 2:50 ` Eric W. Biederman
1 sibling, 1 reply; 26+ messages in thread
From: Yinghai Lu @ 2007-05-18 18:28 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: Eric W. Biederman, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
On 5/18/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
>
> If the vector number stays same during irq migration and if we reset remote
> IRR bit using the above method(edge and then back to level) during
> irq migration, then we have a problem. A new interrupt arriving on a new
> cpu will set the remote IRR bit and now the old inflight EOI broadcast
> reaches IOAPIC RTE(resetting the remote IRR bit, because the vector in the
> broadcast msg is same), while the kernel code still assumes that the remote
> IRR bit is still set. This will lead to more problems and issues.
Could add some lines to __assign_irq_vector to make sure old_vector != vector.
YH
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:28 ` Yinghai Lu
@ 2007-05-18 18:39 ` Siddha, Suresh B
2007-05-18 19:02 ` Eric W. Biederman
0 siblings, 1 reply; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-18 18:39 UTC (permalink / raw)
To: Yinghai Lu
Cc: Siddha, Suresh B, Eric W. Biederman, mingo, ak, akpm,
linux-kernel, Zou, Nanhai, Mallick, Asit K, Packard, Keith
On Fri, May 18, 2007 at 11:28:25AM -0700, Yinghai Lu wrote:
> On 5/18/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
> >
> > If the vector number stays same during irq migration and if we reset remote
> > IRR bit using the above method(edge and then back to level) during
> > irq migration, then we have a problem. A new interrupt arriving on a new
> > cpu will set the remote IRR bit and now the old inflight EOI broadcast
> > reaches IOAPIC RTE(resetting the remote IRR bit, because the vector in the
> > broadcast msg is same), while the kernel code still assumes that the remote
> > IRR bit is still set. This will lead to more problems and issues.
>
> coud add some line __assign_irq_vector. to make sure old_vector!=vector.
hmm..
What happens when there is a second (and very quick) irq migration which
brings the irq back to the old cpu (or to a third cpu) with the old vector?
The point is, we are not taking care of the inflight messages (which can,
perhaps theoretically, get delayed for a long time).
thanks,
suresh
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:39 ` Siddha, Suresh B
@ 2007-05-18 19:02 ` Eric W. Biederman
2007-05-18 19:18 ` Siddha, Suresh B
0 siblings, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-18 19:02 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: Yinghai Lu, mingo, ak, akpm, linux-kernel, Zou, Nanhai,
Mallick, Asit K, Packard, Keith
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> On Fri, May 18, 2007 at 11:28:25AM -0700, Yinghai Lu wrote:
>> On 5/18/07, Siddha, Suresh B <suresh.b.siddha@intel.com> wrote:
>> >
>> > If the vector number stays same during irq migration and if we reset remote
>> > IRR bit using the above method(edge and then back to level) during
>> > irq migration, then we have a problem. A new interrupt arriving on a new
>> > cpu will set the remote IRR bit and now the old inflight EOI broadcast
>> > reaches IOAPIC RTE(resetting the remote IRR bit, because the vector in the
>> > broadcast msg is same), while the kernel code still assumes that the remote
>> > IRR bit is still set. This will lead to more problems and issues.
>>
>> coud add some line __assign_irq_vector. to make sure old_vector!=vector.
>
> hmm..
> what happens when there is second(and very quick) irq migration which brings the
> irq back to old cpu(or to a third cpu) with old vector.
>
> Point is, we are not taking care of the inflight messages(which can perhaps,
> theoretically, can get delayed for long time)
I will look closer but I do believe that from the ioapic to the cpu the 2.6.21
code should be fairly robust with respect to inflight messages from the ioapic
to the local apics and the cpus. What I failed to consider were inflight
messages in the other direction arriving out of order.
Part of the problem in the general case is that the only way you can tell an inflight
message was transmitted is that a message that you can prove followed the first
message arrives somewhere.
I'm in the middle of tracking two other problems so I can't review this in
detail until later today at the earliest.
Eric
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 19:02 ` Eric W. Biederman
@ 2007-05-18 19:18 ` Siddha, Suresh B
0 siblings, 0 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-18 19:18 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Siddha, Suresh B, Yinghai Lu, mingo, ak, akpm, linux-kernel,
Zou, Nanhai, Mallick, Asit K, Packard, Keith
On Fri, May 18, 2007 at 12:02:16PM -0700, Eric W. Biederman wrote:
> I will look closer but I do believe that from the ioapic to the cpu the 2.6.21
> code should be fairly robust with respect to inflight messages from the ioapic
> to the local apics and the cpus. What I failed to consider were inflight
> messages in the other direction arriving out of order.
Yes. Inflight messages from the cpu to the ioapic were what I was referring to.
thanks,
suresh
* Re: [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq
2007-05-18 18:07 ` Siddha, Suresh B
2007-05-18 18:28 ` Yinghai Lu
@ 2007-05-31 2:50 ` Eric W. Biederman
1 sibling, 0 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-31 2:50 UTC (permalink / raw)
To: Siddha, Suresh B
Cc: mingo, ak, akpm, linux-kernel, Zou, Nanhai, Mallick, Asit K,
Packard, Keith
"Siddha, Suresh B" <suresh.b.siddha@intel.com> writes:
> Eric,
>
> On Fri, May 18, 2007 at 07:40:53AM -0700, Eric W. Biederman wrote:
>> Still in any of those I don't see a problem with switching to edge
>> triggered mode and then back again. Either Remote IRR will keep
>> it's current state or it will be cleared. Remote IRR should not
>> get set (when it was clear) by such a procedure because that
>> would mess up the normal interrupt enable sequence that happens
>> on boot. So I'm pretty certain toggling the edge bit is harmless
>> and it may actually clear Remote IRR for us.
> ...
>>
>> I think going more the way that this code has gone on arch/i386 with
>> real functions is preferable.
>
> There is an issue with this suggestion. We have an inflight
> EOI broadcast msg to this IOAPIC (that got delayed but still alive) and
> that can cause problem if we use edge and back to level to reset remote IRR bit.
>
> If the vector number stays same during irq migration and if we reset remote
> IRR bit using the above method(edge and then back to level) during
> irq migration, then we have a problem. A new interrupt arriving on a new
> cpu will set the remote IRR bit and now the old inflight EOI broadcast
> reaches IOAPIC RTE(resetting the remote IRR bit, because the vector in the
> broadcast msg is same), while the kernel code still assumes that the remote
> IRR bit is still set. This will lead to more problems and issues.
Ok. I will agree that if the level->edge->level switch does not reset
Remote_IRR and the EOI arrives in that window, we have another condition
in which Remote_IRR can get stuck on.
Eric
* [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2)
2007-05-17 23:03 [patch] x86_64, irq: check remote IRR bit before migrating level triggered irq Siddha, Suresh B
` (2 preceding siblings ...)
2007-05-18 12:00 ` Andi Kleen
@ 2007-05-31 11:34 ` Eric W. Biederman
2007-05-31 12:01 ` Andi Kleen
2007-05-31 13:34 ` Ingo Molnar
3 siblings, 2 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-31 11:34 UTC (permalink / raw)
To: Andrew Morton
Cc: Siddha, Suresh B, mingo, ak, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
On x86_64 kernels, level-triggered irq migration is initiated in the context
of that interrupt (after executing the irq handler), and the following steps
are taken to migrate the irq:
1. mask IOAPIC RTE entry; // write to IOAPIC RTE
2. EOI; // processor EOI write
3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination
                              // and interrupt vector due to per-cpu vector
                              // allocation.
4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
Because of the per-cpu vector allocation in x86_64 kernels, when the irq
migrates to a different cpu, a new vector (corresponding to the new cpu)
gets allocated.
An EOI write to the local APIC has the side effect of generating an EOI
message for level-triggered interrupts (normally a broadcast to all IOAPICs).
The EOI broadcast generated as a side effect of the EOI write to the processor
may be delayed while the other IOAPIC writes (steps 3 and 4) go through.
Normally, the EOI generated by the local APIC for a level-triggered interrupt
contains the vector number. The IOAPIC takes this vector number, searches
its RTE entries for an entry with a matching vector number, and clears the
remote IRR bit (indicating EOI). However, if the vector number has been
changed (as in step 3), the IOAPIC will not find the RTE entry when the EOI
arrives later. This causes the remote IRR to get stuck, hanging the
interrupt (no more interrupts from this RTE).
The current x86_64 kernel assumes that the remote IRR bit is cleared by the
time the IOAPIC RTE is reprogrammed. Fix this assumption by checking the
remote IRR bit and, if it is still set, delaying the irq migration to the
next interrupt arrival event (hopefully, next time the remote IRR bit will
be cleared before the IOAPIC RTE is reprogrammed).
Initial analysis and patch from Nanhai.
Clean up patch from Suresh.
Rewritten to be less intrusive, and to contain a big fat comment by Eric.
Cc: Nanhai Zou <nanhai.zou@intel.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Keith Packard <keith.packard@intel.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
arch/x86_64/kernel/io_apic.c | 57 ++++++++++++++++++++++++++++++++++++++++-
1 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d8bfe31..2d52aed 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -152,6 +152,31 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value)
writel(value, &io_apic->data);
}
+static int io_apic_level_ack_pending(unsigned int irq)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+ int pending = 0;
+ spin_lock_irqsave(&ioapic_lock, flags);
+ entry = irq_2_pin + irq;
+ for (;;) {
+ unsigned int reg;
+ int pin;
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ pending |= (reg >> 14) & 1;
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return pending;
+}
+
+
/*
* Synchronize the IO-APIC and the CPU by doing
* a dummy read from the IO-APIC
@@ -1418,9 +1443,37 @@ static void ack_apic_level(unsigned int irq)
ack_APIC_irq();
/* Now we can move and renable the irq */
- move_masked_irq(irq);
- if (unlikely(do_unmask_irq))
+ if (unlikely(do_unmask_irq)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occaions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two affects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and causing problems it is a hardware bug
+ * and you can go talk chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(irq))
+ move_masked_irq(irq);
unmask_IO_APIC_irq(irq);
+ }
}
static struct irq_chip ioapic_chip __read_mostly = {
--
1.5.1.1.181.g2de0
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2)
2007-05-31 11:34 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2) Eric W. Biederman
@ 2007-05-31 12:01 ` Andi Kleen
2007-05-31 13:29 ` Eric W. Biederman
2007-05-31 13:34 ` Ingo Molnar
1 sibling, 1 reply; 26+ messages in thread
From: Andi Kleen @ 2007-05-31 12:01 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Andrew Morton, Siddha, Suresh B, mingo, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
On Thursday 31 May 2007 13:34:21 Eric W. Biederman wrote:
>
> On x86_64 kernel, level triggered irq migration gets initiated in the context
> of that interrupt(after executing the irq handler) and following steps are
> followed to do the irq migration.
What's the confidence level in the patch? Is it a .22 candidate?
-Andi
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2)
2007-05-31 12:01 ` Andi Kleen
@ 2007-05-31 13:29 ` Eric W. Biederman
2007-05-31 20:02 ` Siddha, Suresh B
0 siblings, 1 reply; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-31 13:29 UTC (permalink / raw)
To: Andi Kleen
Cc: Andrew Morton, Siddha, Suresh B, mingo, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
Andi Kleen <ak@suse.de> writes:
> On Thursday 31 May 2007 13:34:21 Eric W. Biederman wrote:
>>
>> On x86_64 kernel, level triggered irq migration gets initiated in the context
>> of that interrupt(after executing the irq handler) and following steps are
>> followed to do the irq migration.
>
> What's the confidence level in the patch? Is it a .22 candidate?
Yes.
I just ran it for a little while and nothing fell over, and level
triggered irqs continued to migrate. So it's not horribly broken.
Outside of implementation errors the worst case that can happen with
this code is the current behavior, with a small slow down.
So since I can not reproduce the original problem I would appreciate
an eyeball or two more just to make certain I reimplemented this
correctly and didn't do anything stupid. But I don't expect anything
to be found. The code is pretty straight forward.
Eric
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2)
2007-05-31 11:34 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2) Eric W. Biederman
2007-05-31 12:01 ` Andi Kleen
@ 2007-05-31 13:34 ` Ingo Molnar
2007-05-31 13:50 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3) Eric W. Biederman
1 sibling, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2007-05-31 13:34 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Andrew Morton, Siddha, Suresh B, ak, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
looks good to me:
Acked-by: Ingo Molnar <mingo@elte.hu>
with a few minor style nits:
> +static int io_apic_level_ack_pending(unsigned int irq)
> +{
> + struct irq_pin_list *entry;
> + unsigned long flags;
> + int pending = 0;
> + spin_lock_irqsave(&ioapic_lock, flags);
newline after variable sections please.
> + entry = irq_2_pin + irq;
> + for (;;) {
> + unsigned int reg;
> + int pin;
> + pin = entry->pin;
ditto.
> + if (pin == -1)
> + break;
> + reg = io_apic_read(entry->apic, 0x10 + pin*2);
> + /* Is the remote IRR bit set? */
> + pending |= (reg >> 14) & 1;
> + if (!entry->next)
> + break;
> + entry = irq_2_pin + entry->next;
> + }
> + spin_unlock_irqrestore(&ioapic_lock, flags);
> + return pending;
optional: it looks a bit better with a newline before the 'return'
statement.
> +}
> +
> +
and here there's one too much newline :-)
Ingo
* [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3)
2007-05-31 13:34 ` Ingo Molnar
@ 2007-05-31 13:50 ` Eric W. Biederman
2007-05-31 13:54 ` Ingo Molnar
2007-05-31 20:00 ` Siddha, Suresh B
0 siblings, 2 replies; 26+ messages in thread
From: Eric W. Biederman @ 2007-05-31 13:50 UTC (permalink / raw)
To: Andrew Morton
Cc: Ingo Molnar, Siddha, Suresh B, ak, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
On x86_64 kernels, level-triggered irq migration is initiated in the context
of that interrupt (after executing the irq handler), and the following steps
are taken to migrate the irq:
1. mask IOAPIC RTE entry; // write to IOAPIC RTE
2. EOI; // processor EOI write
3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination
                              // and interrupt vector due to per-cpu vector
                              // allocation.
4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
Because of the per-cpu vector allocation in x86_64 kernels, when the irq
migrates to a different cpu, a new vector (corresponding to the new cpu)
gets allocated.
An EOI write to the local APIC has the side effect of generating an EOI
message for level-triggered interrupts (normally a broadcast to all IOAPICs).
The EOI broadcast generated as a side effect of the EOI write to the processor
may be delayed while the other IOAPIC writes (steps 3 and 4) go through.
Normally, the EOI generated by the local APIC for a level-triggered interrupt
contains the vector number. The IOAPIC takes this vector number, searches
its RTE entries for an entry with a matching vector number, and clears the
remote IRR bit (indicating EOI). However, if the vector number has been
changed (as in step 3), the IOAPIC will not find the RTE entry when the EOI
arrives later. This causes the remote IRR to get stuck, hanging the
interrupt (no more interrupts from this RTE).
The current x86_64 kernel assumes that the remote IRR bit is cleared by the
time the IOAPIC RTE is reprogrammed. Fix this assumption by checking the
remote IRR bit and, if it is still set, delaying the irq migration to the
next interrupt arrival event (hopefully, next time the remote IRR bit will
be cleared before the IOAPIC RTE is reprogrammed).
Initial analysis and patch from Nanhai.
Clean up patch from Suresh.
Rewritten to be less intrusive, and to contain a big fat comment by Eric.
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nanhai Zou <nanhai.zou@intel.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Keith Packard <keith.packard@intel.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
arch/x86_64/kernel/io_apic.c | 58 ++++++++++++++++++++++++++++++++++++++++-
1 files changed, 56 insertions(+), 2 deletions(-)
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d8bfe31..7c032f2 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value)
writel(value, &io_apic->data);
}
+static int io_apic_level_ack_pending(unsigned int irq)
+{
+ struct irq_pin_list *entry;
+ unsigned long flags;
+ int pending = 0;
+
+ spin_lock_irqsave(&ioapic_lock, flags);
+ entry = irq_2_pin + irq;
+ for (;;) {
+ unsigned int reg;
+ int pin;
+
+ pin = entry->pin;
+ if (pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ /* Is the remote IRR bit set? */
+ pending |= (reg >> 14) & 1;
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return pending;
+}
+
/*
* Synchronize the IO-APIC and the CPU by doing
* a dummy read from the IO-APIC
@@ -1418,9 +1444,37 @@ static void ack_apic_level(unsigned int irq)
ack_APIC_irq();
/* Now we can move and renable the irq */
- move_masked_irq(irq);
- if (unlikely(do_unmask_irq))
+ if (unlikely(do_unmask_irq)) {
+ /* Only migrate the irq if the ack has been received.
+ *
+ * On rare occaions the broadcast level triggered ack gets
+ * delayed going to ioapics, and if we reprogram the
+ * vector while Remote IRR is still the irq will never
+ * fire again.
+ *
+ * To prevent this scenario we read the Remote IRR bit
+ * of the ioapic. This has two affects.
+ * - On any sane system the read of the ioapic will
+ * flush writes (and acks) going to the ioapic from
+ * this cpu.
+ * - We get to see if the ACK has actually been delivered.
+ *
+ * Based on failed experiments of reprogramming the
+ * ioapic entry from outside of irq context starting
+ * with masking the ioapic entry and then polling until
+ * Remote IRR was clear before reprogramming the
+ * ioapic I don't trust the Remote IRR bit to be
+ * completey accurate.
+ *
+ * However there appears to be no other way to plug
+ * this race, so if the Remote IRR bit is not
+ * accurate and causing problems it is a hardware bug
+ * and you can go talk chipset vendor about it.
+ */
+ if (!io_apic_level_ack_pending(irq))
+ move_masked_irq(irq);
unmask_IO_APIC_irq(irq);
+ }
}
static struct irq_chip ioapic_chip __read_mostly = {
--
1.5.1.1.181.g2de0
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3)
2007-05-31 13:50 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3) Eric W. Biederman
@ 2007-05-31 13:54 ` Ingo Molnar
2007-05-31 20:00 ` Siddha, Suresh B
1 sibling, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2007-05-31 13:54 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Andrew Morton, Siddha, Suresh B, ak, linux-kernel, nanhai.zou,
asit.k.mallick, keith.packard, Yinghai Lu
* Eric W. Biederman <ebiederm@xmission.com> wrote:
> Initial analysis and patch from Nanhai.
>
> Clean up patch from Suresh.
>
> Rewritten to be less intrusive, and to contain a big fat comment by
> Eric.
thanks Eric!
Ingo
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3)
2007-05-31 13:50 ` [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v3) Eric W. Biederman
2007-05-31 13:54 ` Ingo Molnar
@ 2007-05-31 20:00 ` Siddha, Suresh B
1 sibling, 0 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-31 20:00 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Andrew Morton, Ingo Molnar, Siddha, Suresh B, ak, linux-kernel,
nanhai.zou, asit.k.mallick, keith.packard, Yinghai Lu
On Thu, May 31, 2007 at 07:50:58AM -0600, Eric W. Biederman wrote:
>
> On x86_64 kernel, level triggered irq migration gets initiated in the context
> of that interrupt(after executing the irq handler) and following steps are
> followed to do the irq migration.
>
> 1. mask IOAPIC RTE entry; // write to IOAPIC RTE
> 2. EOI; // processor EOI write
> 3. reprogram IOAPIC RTE entry // write to IOAPIC RTE with new destination and
> // and interrupt vector due to per cpu vector
> // allocation.
> 4. unmask IOAPIC RTE entry; // write to IOAPIC RTE
>
> Because of the per cpu vector allocation in x86_64 kernels, when the irq
> migrates to a different cpu, new vector(corresponding to the new cpu) will
> get allocated.
>
> An EOI write to local APIC has a side effect of generating an EOI write
> for level trigger interrupts (normally this is a broadcast to all IOAPICs).
> The EOI broadcast generated as a side effect of EOI write to processor may
> be delayed while the other IOAPIC writes (step 3 and 4) can go through.
>
> Normally, the EOI generated by local APIC for level trigger interrupt
> contains vector number. The IOAPIC will take this vector number and
> search the IOAPIC RTE entries for an entry with matching vector number and
> clear the remote IRR bit (indicate EOI). However, if the vector number is
> changed (as in step 3) the IOAPIC will not find the RTE entry when the EOI
> is received later. This will cause the remote IRR to get stuck causing the
> interrupt hang (no more interrupt from this RTE).
>
> Current x86_64 kernel assumes that remote IRR bit is cleared by the time
> IOAPIC RTE is reprogrammed. Fix this assumption by checking for remote IRR
> bit and if it still set, delay the irq migration to the next interrupt
> arrival event(hopefully, next time remote IRR bit will get cleared
> before the IOAPIC RTE is reprogrammed).
>
> Initial analysis and patch from Nanhai.
>
> Clean up patch from Suresh.
>
> Rewritten to be less intrusive, and to contain a big fat comment by Eric.
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Thanks Eric.
* Re: [PATCH] x86_64 irq: check remote IRR bit before migrating level triggered irq (v2)
2007-05-31 13:29 ` Eric W. Biederman
@ 2007-05-31 20:02 ` Siddha, Suresh B
0 siblings, 0 replies; 26+ messages in thread
From: Siddha, Suresh B @ 2007-05-31 20:02 UTC (permalink / raw)
To: Eric W. Biederman
Cc: Andi Kleen, Andrew Morton, Siddha, Suresh B, mingo, linux-kernel,
nanhai.zou, asit.k.mallick, keith.packard, Yinghai Lu
On Thu, May 31, 2007 at 07:29:43AM -0600, Eric W. Biederman wrote:
> Andi Kleen <ak@suse.de> writes:
>
> > On Thursday 31 May 2007 13:34:21 Eric W. Biederman wrote:
> >>
> >> On x86_64 kernel, level triggered irq migration gets initiated in the context
> >> of that interrupt(after executing the irq handler) and following steps are
> >> followed to do the irq migration.
> >
> > What's the confidence level in the patch? Is it a .22 candidate?
>
> Yes.
>
> I just ran it for a little while and nothing fell over, and level
> triggered irqs continued to migrate. So it's not horribly broken.
>
> Outside of implementation errors the worst case that can happen with
> this code is the current behavior, with a small slow down.
>
> So since I can not reproduce the original problem I would appreciate
> an eyeball or two more just to make certain I reimplemented this
> correctly and didn't do anything stupid. But I don't expect anything
> to be found. The code is pretty straight forward.
Yes. We want this to get included in .22. We did extensive testing with
our patch before. We will also do more tests with this version, however.
thanks,
suresh