public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Gary Hade <garyhade@us.ibm.com>
To: mingo@elte.hu, mingo@redhat.com, tglx@linutronix.de,
	hpa@zytor.com, x86@kernel.org
Cc: linux-kernel@vger.kernel.org, garyhade@us.ibm.com, lcm@us.ibm.com
Subject: [PATCH 3/3] [BUGFIX] x86/x86_64: fix IRQ migration triggered active device IRQ interruption
Date: Wed, 8 Apr 2009 14:07:45 -0700	[thread overview]
Message-ID: <20090408210745.GE11159@us.ibm.com> (raw)

Impact: Eliminates an issue that can leave the system in an
        unusable state.

This patch addresses an issue where device-generated IRQs
are no longer seen by the kernel following IRQ affinity
migration while the device is generating IRQs at a high rate.
We have seen this problem happen when IRQ affinities are
adjusted in response to CPU offlining, but I believe it
could also happen during user-initiated IRQ affinity
changes unrelated to CPU offlining, e.g. while the
irqbalance daemon is adjusting IRQ affinities when the
system is heavily loaded.

I have been able to consistently reproduce the problem on
some of our systems by running the following script (VICTIM_IRQ
specifies the IRQ for the aic94xx device) while a single instance
of the command
  # while true; do find / -exec file {} \;; done
is keeping the filesystem activity and IRQ rate reasonably high.

#!/bin/sh

SYS_CPU_DIR=/sys/devices/system/cpu
VICTIM_IRQ=25
IRQ_MASK=f0

iteration=0
while true; do
  echo $iteration
  echo $IRQ_MASK > /proc/irq/$VICTIM_IRQ/smp_affinity
  for cpudir in $SYS_CPU_DIR/cpu[1-9] $SYS_CPU_DIR/cpu??; do
    echo 0 > $cpudir/online
  done
  for cpudir in $SYS_CPU_DIR/cpu[1-9] $SYS_CPU_DIR/cpu??; do
    echo 1 > $cpudir/online
  done
  iteration=`expr $iteration + 1`
done

The root cause is a known issue already addressed for some
code paths [e.g. ack_apic_level() and the now obsolete
migrate_irq_remapped_level_desc()] where the ioapic can
misbehave when the I/O redirection table register is written
while the Remote IRR bit is set.

The proposed fix uses the same avoidance method and much
of the same code that the Interrupt Remapping code previously
used to avoid the same problem.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>

---
 arch/x86/kernel/apic/io_apic.c |   72 ++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

Index: linux-2.6.30-rc1/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.30-rc1.orig/arch/x86/kernel/apic/io_apic.c	2009-04-08 09:24:11.000000000 -0700
+++ linux-2.6.30-rc1/arch/x86/kernel/apic/io_apic.c	2009-04-08 09:24:23.000000000 -0700
@@ -2331,7 +2331,8 @@ set_desc_affinity(struct irq_desc *desc,
 }
 
 static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+set_ioapic_irq_affinity_desc(struct irq_desc *desc,
+			     const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
@@ -2352,6 +2353,75 @@ set_ioapic_affinity_irq_desc(struct irq_
 }
 
 static void
+delayed_irq_move(struct work_struct *work)
+{
+	unsigned int irq;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(irq, desc) {
+		if (desc->status & IRQ_MOVE_PENDING) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&desc->lock, flags);
+			if (!desc->chip->set_affinity ||
+			    !(desc->status & IRQ_MOVE_PENDING)) {
+				desc->status &= ~IRQ_MOVE_PENDING;
+				spin_unlock_irqrestore(&desc->lock, flags);
+				continue;
+			}
+
+			desc->chip->set_affinity(irq, desc->pending_mask);
+			spin_unlock_irqrestore(&desc->lock, flags);
+		}
+	}
+}
+
+static DECLARE_DELAYED_WORK(delayed_irq_move_work, delayed_irq_move);
+
+static void
+set_ioapic_irq_affinity_level_desc(struct irq_desc *desc)
+{
+
+	struct irq_cfg *cfg = desc->chip_data;
+
+	mask_IO_APIC_irq_desc(desc);
+
+	if (io_apic_level_ack_pending(cfg)) {
+		/*
+		 * Interrupt in progress. Migrating irq now will change
+		 * the vector information in the IO-APIC RTE which will
+		 * confuse the EOI broadcast performed by cpu.
+		 * So, we delay the irq migration.
+		 */
+		schedule_delayed_work(&delayed_irq_move_work, 1);
+		goto unmask;
+	}
+
+	/* Interrupt not in progress. we can change the vector
+	 * information in the IO-APIC RTE. */
+	set_ioapic_irq_affinity_desc(desc, desc->pending_mask);
+
+	desc->status &= ~IRQ_MOVE_PENDING;
+	cpumask_clear(desc->pending_mask);
+
+unmask:
+	unmask_IO_APIC_irq_desc(desc);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc,
+			     const struct cpumask *mask)
+{
+	if (desc->status & IRQ_LEVEL) {
+		desc->status |= IRQ_MOVE_PENDING;
+		cpumask_copy(desc->pending_mask, mask);
+		set_ioapic_irq_affinity_level_desc(desc);
+		return;
+	}
+	set_ioapic_irq_affinity_desc(desc, mask);
+}
+
+static void
 set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc;

             reply	other threads:[~2009-04-08 21:08 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-04-08 21:07 Gary Hade [this message]
2009-04-08 22:03 ` [PATCH 3/3] [BUGFIX] x86/x86_64: fix IRQ migration triggered active device IRQ interruption Yinghai Lu
2009-04-08 23:08   ` Gary Hade
2009-04-11  6:46     ` Yinghai Lu
2009-04-13 19:37       ` Gary Hade
2009-04-13 20:17         ` Eric W. Biederman
2009-04-28  0:05           ` Gary Hade
2009-04-28 10:27             ` Eric W. Biederman
2009-04-29  0:44               ` Gary Hade
2009-04-29  1:44                 ` Eric W. Biederman
2009-04-29 17:17                   ` Gary Hade
2009-04-29 17:46                     ` Eric W. Biederman
2009-04-30 18:15                       ` Gary Hade
2009-04-30 21:17                         ` Gary Hade
2009-05-24  0:24                       ` Yinghai Lu
2009-04-10 21:39   ` Gary Hade
2009-04-11  7:35     ` Yinghai Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090408210745.GE11159@us.ibm.com \
    --to=garyhade@us.ibm.com \
    --cc=hpa@zytor.com \
    --cc=lcm@us.ibm.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=mingo@redhat.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox