* [patch 1/3] x86/msi: Make irq_retrigger() functional for posted MSI
2025-11-25 10:20 [patch 0/3] x86/irq: Bugfix and cleanup for posted MSI interrupts Thomas Gleixner
@ 2025-11-25 10:20 ` Thomas Gleixner
2025-11-25 17:54 ` Luigi Rizzo
2025-11-25 10:20 ` [patch 2/3] x86/irq: Cleanup posted MSI code Thomas Gleixner
2025-11-25 10:20 ` [patch 3/3] x86/irq_remapping: Sanitize posted_msi_supported() Thomas Gleixner
2 siblings, 1 reply; 6+ messages in thread
From: Thomas Gleixner @ 2025-11-25 10:20 UTC (permalink / raw)
To: LKML; +Cc: x86, Luigi Rizzo, stable, Lu Baolu, Joerg Roedel
Luigi reported that retriggering a posted MSI interrupt does not work
correctly.
The reason is that the retrigger happens at the vector domain by sending an
IPI to the actual vector on the target CPU. That works correctly exactly
once because the posted MSI interrupt chip does not issue an EOI as that's
only required for the posted MSI notification vector itself.
As a consequence the vector becomes stale in the ISR, which not only
affects this vector but also any lower priority vector in the affected
APIC because the ISR bit is not cleared.
Luigi proposed to set the vector in the remap PIR bitmap and raise the
posted MSI notification vector. That works, but that still does not cure a
related problem:
If there is ever a stray interrupt on such a vector, then the related
APIC ISR bit becomes stale due to the lack of EOI as described above.
Unlikely to happen, but if it happens it's not debuggable at all.
So instead of playing games with the PIR, this can actually be solved
for both cases by:
1) Keeping track of the posted interrupt vector handler state
2) Implementing a posted MSI specific irq_ack() callback which checks that
state. If the posted vector handler is inactive it issues an EOI,
otherwise it delegates that to the posted handler.
This is correct versus affinity changes and concurrent events on the posted
vector as the actual handler invocation is serialized through the interrupt
descriptor lock.
Fixes: ed1e48ea4370 ("iommu/vt-d: Enable posted mode for device MSIs")
Reported-by: Luigi Rizzo <lrizzo@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Luigi Rizzo <lrizzo@google.com>
Cc: stable@vger.kernel.org
Closes: https://lore.kernel.org/lkml/20251124104836.3685533-1-lrizzo@google.com
---
arch/x86/include/asm/irq_remapping.h | 7 +++++++
arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++
drivers/iommu/intel/irq_remapping.c | 8 ++++----
3 files changed, 34 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -87,4 +87,11 @@ static inline void panic_if_irq_remap(co
}
#endif /* CONFIG_IRQ_REMAP */
+
+#ifdef CONFIG_X86_POSTED_MSI
+void intel_ack_posted_msi_irq(struct irq_data *irqd);
+#else
+#define intel_ack_posted_msi_irq NULL
+#endif
+
#endif /* __X86_IRQ_REMAPPING_H */
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -397,6 +397,7 @@ DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm
/* Posted Interrupt Descriptors for coalesced MSIs to be posted */
DEFINE_PER_CPU_ALIGNED(struct pi_desc, posted_msi_pi_desc);
+static DEFINE_PER_CPU_CACHE_HOT(bool, posted_msi_handler_active);
void intel_posted_msi_init(void)
{
@@ -414,6 +415,25 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
+void intel_ack_posted_msi_irq(struct irq_data *irqd)
+{
+ irq_move_irq(irqd);
+
+ /*
+ * Handle the rare case that irq_retrigger() raised the actual
+ * assigned vector on the target CPU, which means that it was not
+ * invoked via the posted MSI handler below. In that case APIC EOI
+ * is required as otherwise the ISR entry becomes stale and lower
+ * priority interrupts are never going to be delivered after that.
+ *
+ * If the posted handler invoked the device interrupt handler then
+ * the EOI would be premature because it would acknowledge the
+ * posted vector.
+ */
+ if (unlikely(!this_cpu_read(posted_msi_handler_active)))
+ apic_eoi();
+}
+
static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
{
unsigned long pir_copy[NR_PIR_WORDS];
@@ -446,6 +466,8 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi
pid = this_cpu_ptr(&posted_msi_pi_desc);
+ /* Mark the handler active for intel_ack_posted_msi_irq() */
+ this_cpu_write(posted_msi_handler_active, true);
inc_irq_stat(posted_msi_notification_count);
irq_enter();
@@ -474,6 +496,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi
apic_eoi();
irq_exit();
+ this_cpu_write(posted_msi_handler_active, false);
set_irq_regs(old_regs);
}
#endif /* X86_POSTED_MSI */
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1303,17 +1303,17 @@ static struct irq_chip intel_ir_chip = {
* irq_enter();
* handle_edge_irq()
* irq_chip_ack_parent()
- * irq_move_irq(); // No EOI
+ * intel_ack_posted_msi_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* handle_edge_irq()
* irq_chip_ack_parent()
- * irq_move_irq(); // No EOI
+ * intel_ack_posted_msi_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* handle_edge_irq()
* irq_chip_ack_parent()
- * irq_move_irq(); // No EOI
+ * intel_ack_posted_msi_irq(); // No EOI
* handle_irq_event()
* driver_handler()
* apic_eoi()
@@ -1322,7 +1322,7 @@ static struct irq_chip intel_ir_chip = {
*/
static struct irq_chip intel_ir_chip_post_msi = {
.name = "INTEL-IR-POST",
- .irq_ack = irq_move_irq,
+ .irq_ack = intel_ack_posted_msi_irq,
.irq_set_affinity = intel_ir_set_affinity,
.irq_compose_msi_msg = intel_ir_compose_msi_msg,
.irq_set_vcpu_affinity = intel_ir_set_vcpu_affinity,
^ permalink raw reply [flat|nested] 6+ messages in thread
* [patch 2/3] x86/irq: Cleanup posted MSI code
2025-11-25 10:20 [patch 0/3] x86/irq: Bugfix and cleanup for posted MSI interrupts Thomas Gleixner
2025-11-25 10:20 ` [patch 1/3] x86/msi: Make irq_retrigger() functional for posted MSI Thomas Gleixner
@ 2025-11-25 10:20 ` Thomas Gleixner
2025-11-25 10:20 ` [patch 3/3] x86/irq_remapping: Sanitize posted_msi_supported() Thomas Gleixner
2 siblings, 0 replies; 6+ messages in thread
From: Thomas Gleixner @ 2025-11-25 10:20 UTC (permalink / raw)
To: LKML; +Cc: x86, Luigi Rizzo, Lu Baolu, Joerg Roedel
Make code and comments readable.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
arch/x86/kernel/irq.c | 31 +++++++++++++------------------
1 file changed, 13 insertions(+), 18 deletions(-)
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -401,11 +401,9 @@ static DEFINE_PER_CPU_CACHE_HOT(bool, po
void intel_posted_msi_init(void)
{
- u32 destination;
- u32 apic_id;
+ u32 destination, apic_id;
this_cpu_write(posted_msi_pi_desc.nv, POSTED_MSI_NOTIFICATION_VECTOR);
-
/*
* APIC destination ID is stored in bit 8:15 while in XAPIC mode.
* VT-d spec. CH 9.11
@@ -449,8 +447,8 @@ static __always_inline bool handle_pendi
}
/*
- * Performance data shows that 3 is good enough to harvest 90+% of the benefit
- * on high IRQ rate workload.
+ * Performance data shows that 3 is good enough to harvest 90+% of the
+ * benefit on high interrupt rate workloads.
*/
#define MAX_POSTED_MSI_COALESCING_LOOP 3
@@ -460,11 +458,8 @@ static __always_inline bool handle_pendi
*/
DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
{
+ struct pi_desc *pid = this_cpu_ptr(&posted_msi_pi_desc);
struct pt_regs *old_regs = set_irq_regs(regs);
- struct pi_desc *pid;
- int i = 0;
-
- pid = this_cpu_ptr(&posted_msi_pi_desc);
/* Mark the handler active for intel_ack_posted_msi_irq() */
this_cpu_write(posted_msi_handler_active, true);
@@ -472,25 +467,25 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi
irq_enter();
/*
- * Max coalescing count includes the extra round of handle_pending_pir
- * after clearing the outstanding notification bit. Hence, at most
- * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
+ * Loop only MAX_POSTED_MSI_COALESCING_LOOP - 1 times here to take
+ * the final handle_pending_pir() invocation after clearing the
+ * outstanding notification bit into account.
*/
- while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
+ for (int i = 1; i < MAX_POSTED_MSI_COALESCING_LOOP; i++) {
if (!handle_pending_pir(pid->pir, regs))
break;
}
/*
- * Clear outstanding notification bit to allow new IRQ notifications,
- * do this last to maximize the window of interrupt coalescing.
+ * Clear the outstanding notification bit to rearm the notification
+ * mechanism.
*/
pi_clear_on(pid);
/*
- * There could be a race of PI notification and the clearing of ON bit,
- * process PIR bits one last time such that handling the new interrupts
- * are not delayed until the next IRQ.
+ * Clearing the ON bit can race with a notification. Process the
+ * PIR bits one last time so that handling the new interrupts is
+ * not delayed until the next notification happens.
*/
handle_pending_pir(pid->pir, regs);
^ permalink raw reply [flat|nested] 6+ messages in thread
* [patch 3/3] x86/irq_remapping: Sanitize posted_msi_supported()
2025-11-25 10:20 [patch 0/3] x86/irq: Bugfix and cleanup for posted MSI interrupts Thomas Gleixner
2025-11-25 10:20 ` [patch 1/3] x86/msi: Make irq_retrigger() functional for posted MSI Thomas Gleixner
2025-11-25 10:20 ` [patch 2/3] x86/irq: Cleanup posted MSI code Thomas Gleixner
@ 2025-11-25 10:20 ` Thomas Gleixner
2 siblings, 0 replies; 6+ messages in thread
From: Thomas Gleixner @ 2025-11-25 10:20 UTC (permalink / raw)
To: LKML; +Cc: x86, Lu Baolu, Joerg Roedel, Luigi Rizzo
posted_msi_supported() is a misnomer as it actually checks whether posted
MSI is enabled or not. Aside from that, it does not take
CONFIG_X86_POSTED_MSI into account, which is required to actually use it.
Rename it to posted_msi_enabled() and make the return value depend on
CONFIG_X86_POSTED_MSI, which allows the compiler to eliminate the related
dead code and data if disabled:
text data bss dec hex filename
10046 701 3296 14043 36db drivers/iommu/intel/irq_remapping.o
9904 413 3296 13613 352d drivers/iommu/intel/irq_remapping.o
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Cc: Joerg Roedel <joro@8bytes.org>
---
arch/x86/include/asm/irq_remapping.h | 5 +++--
drivers/iommu/intel/irq_remapping.c | 4 ++--
2 files changed, 5 insertions(+), 4 deletions(-)
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -67,9 +67,10 @@ static inline struct irq_domain *arch_ge
extern bool enable_posted_msi;
-static inline bool posted_msi_supported(void)
+static inline bool posted_msi_enabled(void)
{
- return enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
+ return IS_ENABLED(CONFIG_X86_POSTED_MSI) &&
+ enable_posted_msi && irq_remapping_cap(IRQ_POSTING_CAP);
}
#else /* CONFIG_IRQ_REMAP */
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -1368,7 +1368,7 @@ static void intel_irq_remapping_prepare_
break;
case X86_IRQ_ALLOC_TYPE_PCI_MSI:
case X86_IRQ_ALLOC_TYPE_PCI_MSIX:
- if (posted_msi_supported()) {
+ if (posted_msi_enabled()) {
prepare_irte_posted(irte);
data->irq_2_iommu.posted_msi = 1;
}
@@ -1460,7 +1460,7 @@ static int intel_irq_remapping_alloc(str
irq_data->hwirq = (index << 16) + i;
irq_data->chip_data = ird;
- if (posted_msi_supported() &&
+ if (posted_msi_enabled() &&
((info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) ||
(info->type == X86_IRQ_ALLOC_TYPE_PCI_MSIX)))
irq_data->chip = &intel_ir_chip_post_msi;
^ permalink raw reply [flat|nested] 6+ messages in thread