linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
@ 2025-06-26 14:47 Nam Cao
  2025-06-26 14:47 ` [PATCH 1/3] powerpc/xive: Untangle xive from child interrupt controller drivers Nam Cao
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Nam Cao @ 2025-06-26 14:47 UTC (permalink / raw)
  To: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel
  Cc: Nam Cao

The initial implementation of PCI/MSI interrupt domains in the hierarchical
interrupt domain model used a shortcut by providing a global PCI/MSI
domain.

This works because the PCI/MSI[X] hardware is standardized and uniform, but
it violates the basic design principle of hierarchical interrupt domains:
Each hardware block involved in the interrupt delivery chain should have a
separate interrupt domain.

For PCI/MSI[X], the interrupt controller is per PCI device and not a global
made-up entity.

Unsurprisingly, the shortcut turned out to have downsides as it does not
allow dynamic allocation of interrupt vectors after initialization and it
prevents supporting IMS on PCI. For further details, see:

https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de/

The solution is implementing per device MSI domains, this means the
entities which provide global PCI/MSI domain so far have to implement MSI
parent domain functionality instead.

This series:

   - Untangle XIVE driver from Powernv and Pseries drivers

   - Convert the Powernv and Pseries drivers to implement MSI parent domain
     functionality

Nam Cao (3):
  powerpc/xive: Untangle xive from child interrupt controller drivers
  powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain()
  powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()

 arch/powerpc/include/asm/pci-bridge.h     |   2 -
 arch/powerpc/include/asm/xive.h           |   1 -
 arch/powerpc/platforms/powernv/Kconfig    |   1 +
 arch/powerpc/platforms/powernv/pci-ioda.c |  96 ++++++-----------
 arch/powerpc/platforms/pseries/Kconfig    |   1 +
 arch/powerpc/platforms/pseries/msi.c      | 124 ++++++++--------------
 arch/powerpc/sysdev/xive/common.c         |  63 ++++++-----
 7 files changed, 113 insertions(+), 175 deletions(-)

-- 
2.39.5


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/3] powerpc/xive: Untangle xive from child interrupt controller drivers
  2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
@ 2025-06-26 14:47 ` Nam Cao
  2025-06-26 14:47 ` [PATCH 2/3] powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain() Nam Cao
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Nam Cao @ 2025-06-26 14:47 UTC (permalink / raw)
  To: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel
  Cc: Nam Cao

xive-specific data is stored in handler_data. This creates a mess, as xive
has to rely on child interrupt controller drivers to clean up this data, as
was done by 9a014f45688 ("powerpc/pseries/pci: Add a msi_free() handler to
clear XIVE data").

Instead, store xive-specific data in chip_data and untangle the child
drivers.

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 arch/powerpc/include/asm/xive.h           |  1 -
 arch/powerpc/platforms/powernv/pci-ioda.c | 21 +-------
 arch/powerpc/platforms/pseries/msi.c      | 18 +------
 arch/powerpc/sysdev/xive/common.c         | 63 +++++++++++------------
 4 files changed, 33 insertions(+), 70 deletions(-)

diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
index 92930b0b5d0e1..efb0f5effcc69 100644
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -111,7 +111,6 @@ void xive_native_free_vp_block(u32 vp_base);
 int xive_native_populate_irq_data(u32 hw_irq,
 				  struct xive_irq_data *data);
 void xive_cleanup_irq_data(struct xive_irq_data *xd);
-void xive_irq_free_data(unsigned int virq);
 void xive_native_free_irq(u32 irq);
 int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
 
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index d8ccf2c9b98ad..fb37098d4d58c 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -37,7 +37,6 @@
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
 #include <asm/mmzone.h>
-#include <asm/xive.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -1707,23 +1706,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
 	return 0;
 }
 
-/*
- * The msi_free() op is called before irq_domain_free_irqs_top() when
- * the handler data is still available. Use that to clear the XIVE
- * controller.
- */
-static void pnv_msi_ops_msi_free(struct irq_domain *domain,
-				 struct msi_domain_info *info,
-				 unsigned int irq)
-{
-	if (xive_enabled())
-		xive_irq_free_data(irq);
-}
-
-static struct msi_domain_ops pnv_pci_msi_domain_ops = {
-	.msi_free	= pnv_msi_ops_msi_free,
-};
-
 static void pnv_msi_shutdown(struct irq_data *d)
 {
 	d = d->parent_data;
@@ -1754,7 +1736,6 @@ static struct irq_chip pnv_pci_msi_irq_chip = {
 static struct msi_domain_info pnv_msi_domain_info = {
 	.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
 		  MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX),
-	.ops   = &pnv_pci_msi_domain_ops,
 	.chip  = &pnv_pci_msi_irq_chip,
 };
 
@@ -1870,7 +1851,7 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
 		 virq, d->hwirq, nr_irqs);
 
 	msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
-	/* XIVE domain is cleared through ->msi_free() */
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
 
 static const struct irq_domain_ops pnv_irq_domain_ops = {
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index ee1c8c6898a3c..10712477938e4 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -15,7 +15,6 @@
 #include <asm/hw_irq.h>
 #include <asm/ppc-pci.h>
 #include <asm/machdep.h>
-#include <asm/xive.h>
 
 #include "pseries.h"
 
@@ -436,19 +435,6 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
 	return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
 
-/*
- * ->msi_free() is called before irq_domain_free_irqs_top() when the
- * handler data is still available. Use that to clear the XIVE
- * controller data.
- */
-static void pseries_msi_ops_msi_free(struct irq_domain *domain,
-				     struct msi_domain_info *info,
-				     unsigned int irq)
-{
-	if (xive_enabled())
-		xive_irq_free_data(irq);
-}
-
 /*
  * RTAS can not disable one MSI at a time. It's all or nothing. Do it
  * at the end after all IRQs have been freed.
@@ -463,7 +449,6 @@ static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev)
 
 static struct msi_domain_ops pseries_pci_msi_domain_ops = {
 	.msi_prepare	= pseries_msi_ops_prepare,
-	.msi_free	= pseries_msi_ops_msi_free,
 	.msi_post_free	= pseries_msi_post_free,
 };
 
@@ -604,8 +589,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
 	struct pci_controller *phb = irq_data_get_irq_chip_data(d);
 
 	pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs);
-
-	/* XIVE domain data is cleared through ->msi_free() */
+	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
 
 static const struct irq_domain_ops pseries_irq_domain_ops = {
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
index f105924050247..625361a15424e 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -317,7 +317,7 @@ int xmon_xive_get_irq_config(u32 hw_irq, struct irq_data *d)
 	if (d) {
 		char buffer[128];
 
-		xive_irq_data_dump(irq_data_get_irq_handler_data(d),
+		xive_irq_data_dump(irq_data_get_irq_chip_data(d),
 				   buffer, sizeof(buffer));
 		xmon_printf("%s", buffer);
 	}
@@ -437,7 +437,7 @@ static void xive_do_source_eoi(struct xive_irq_data *xd)
 /* irq_chip eoi callback, called with irq descriptor lock held */
 static void xive_irq_eoi(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
 
 	DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
@@ -595,7 +595,7 @@ static int xive_pick_irq_target(struct irq_data *d,
 				const struct cpumask *affinity)
 {
 	static unsigned int fuzz;
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	cpumask_var_t mask;
 	int cpu = -1;
 
@@ -628,7 +628,7 @@ static int xive_pick_irq_target(struct irq_data *d,
 
 static unsigned int xive_irq_startup(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
 	int target, rc;
 
@@ -673,7 +673,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
 /* called with irq descriptor lock held */
 static void xive_irq_shutdown(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
 
 	pr_debug("%s: irq %d [0x%x] data @%p\n", __func__, d->irq, hw_irq, d);
@@ -698,7 +698,7 @@ static void xive_irq_shutdown(struct irq_data *d)
 
 static void xive_irq_unmask(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 
 	pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd);
 
@@ -707,7 +707,7 @@ static void xive_irq_unmask(struct irq_data *d)
 
 static void xive_irq_mask(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 
 	pr_debug("%s: irq %d data @%p\n", __func__, d->irq, xd);
 
@@ -718,7 +718,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
 				 const struct cpumask *cpumask,
 				 bool force)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
 	u32 target, old_target;
 	int rc = 0;
@@ -776,7 +776,7 @@ static int xive_irq_set_affinity(struct irq_data *d,
 
 static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 
 	/*
 	 * We only support these. This has really no effect other than setting
@@ -815,7 +815,7 @@ static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
 
 static int xive_irq_retrigger(struct irq_data *d)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 
 	/* This should be only for MSIs */
 	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
@@ -837,7 +837,7 @@ static int xive_irq_retrigger(struct irq_data *d)
  */
 static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(d);
 	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
 	int rc;
 	u8 pq;
@@ -951,7 +951,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
 static int xive_get_irqchip_state(struct irq_data *data,
 				  enum irqchip_irq_state which, bool *state)
 {
-	struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+	struct xive_irq_data *xd = irq_data_get_irq_chip_data(data);
 	u8 pq;
 
 	switch (which) {
@@ -1011,21 +1011,20 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd)
 }
 EXPORT_SYMBOL_GPL(xive_cleanup_irq_data);
 
-static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
+static struct xive_irq_data *xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
 {
 	struct xive_irq_data *xd;
 	int rc;
 
 	xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
 	if (!xd)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	rc = xive_ops->populate_irq_data(hw, xd);
 	if (rc) {
 		kfree(xd);
-		return rc;
+		return ERR_PTR(rc);
 	}
 	xd->target = XIVE_INVALID_TARGET;
-	irq_set_handler_data(virq, xd);
 
 	/*
 	 * Turn OFF by default the interrupt being mapped. A side
@@ -1036,20 +1035,19 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
 	 */
 	xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
 
-	return 0;
+	return xd;
 }
 
-void xive_irq_free_data(unsigned int virq)
+static void xive_irq_free_data(unsigned int virq)
 {
-	struct xive_irq_data *xd = irq_get_handler_data(virq);
+	struct xive_irq_data *xd = irq_get_chip_data(virq);
 
 	if (!xd)
 		return;
-	irq_set_handler_data(virq, NULL);
+	irq_set_chip_data(virq, NULL);
 	xive_cleanup_irq_data(xd);
 	kfree(xd);
 }
-EXPORT_SYMBOL_GPL(xive_irq_free_data);
 
 #ifdef CONFIG_SMP
 
@@ -1286,7 +1284,7 @@ void __init xive_smp_probe(void)
 static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
 			       irq_hw_number_t hw)
 {
-	int rc;
+	struct xive_irq_data *xd;
 
 	/*
 	 * Mark interrupts as edge sensitive by default so that resend
@@ -1294,11 +1292,12 @@ static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
 	 */
 	irq_clear_status_flags(virq, IRQ_LEVEL);
 
-	rc = xive_irq_alloc_data(virq, hw);
-	if (rc)
-		return rc;
+	xd = xive_irq_alloc_data(virq, hw);
+	if (IS_ERR(xd))
+		return PTR_ERR(xd);
 
 	irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
+	irq_set_chip_data(virq, xd);
 
 	return 0;
 }
@@ -1366,7 +1365,7 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d,
 	seq_printf(m, "%*sXIVE:\n", ind, "");
 	ind++;
 
-	xd = irq_data_get_irq_handler_data(irqd);
+	xd = irq_data_get_irq_chip_data(irqd);
 	if (!xd) {
 		seq_printf(m, "%*snot assigned\n", ind, "");
 		return;
@@ -1403,6 +1402,7 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 				 unsigned int nr_irqs, void *arg)
 {
 	struct irq_fwspec *fwspec = arg;
+	struct xive_irq_data *xd;
 	irq_hw_number_t hwirq;
 	unsigned int type = IRQ_TYPE_NONE;
 	int i, rc;
@@ -1423,12 +1423,11 @@ static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 		irq_clear_status_flags(virq, IRQ_LEVEL);
 
 		/* allocates and sets handler data */
-		rc = xive_irq_alloc_data(virq + i, hwirq + i);
-		if (rc)
-			return rc;
+		xd = xive_irq_alloc_data(virq + i, hwirq + i);
+		if (IS_ERR(xd))
+			return PTR_ERR(xd);
 
-		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
-					      &xive_irq_chip, domain->host_data);
+		irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, &xive_irq_chip, xd);
 		irq_set_handler(virq + i, handle_fasteoi_irq);
 	}
 
@@ -1764,7 +1763,7 @@ static void xive_debug_show_irq(struct seq_file *m, struct irq_data *d)
 	seq_printf(m, "IRQ 0x%08x : target=0x%x prio=%02x lirq=0x%x ",
 		   hw_irq, target, prio, lirq);
 
-	xive_irq_data_dump(irq_data_get_irq_handler_data(d), buffer, sizeof(buffer));
+	xive_irq_data_dump(irq_data_get_irq_chip_data(d), buffer, sizeof(buffer));
 	seq_puts(m, buffer);
 	seq_puts(m, "\n");
 }
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 2/3] powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain()
  2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
  2025-06-26 14:47 ` [PATCH 1/3] powerpc/xive: Untangle xive from child interrupt controller drivers Nam Cao
@ 2025-06-26 14:47 ` Nam Cao
  2025-06-26 14:47 ` [PATCH 3/3] powerpc/pseries/msi: Switch to msi_create_parent_irq_domain() Nam Cao
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Nam Cao @ 2025-06-26 14:47 UTC (permalink / raw)
  To: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel
  Cc: Nam Cao

Move away from the legacy MSI domain setup, switch to use
msi_create_parent_irq_domain().

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 arch/powerpc/platforms/powernv/Kconfig    |  1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 75 ++++++++++-------------
 2 files changed, 35 insertions(+), 41 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 95d7ba73d43d0..b5ad7c173ef0c 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -9,6 +9,7 @@ config PPC_POWERNV
 	select PPC_P7_NAP
 	select FORCE_PCI
 	select PCI_MSI
+	select IRQ_MSI_LIB
 	select EPAPR_BOOT
 	select PPC_INDIRECT_PIO
 	select PPC_UDBG_16550
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fb37098d4d58c..c2a932c6e3040 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/memblock.h>
 #include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <linux/io.h>
 #include <linux/msi.h>
 #include <linux/iommu.h>
@@ -1713,30 +1714,33 @@ static void pnv_msi_shutdown(struct irq_data *d)
 		d->chip->irq_shutdown(d);
 }
 
-static void pnv_msi_mask(struct irq_data *d)
+static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				  struct irq_domain *real_parent, struct msi_domain_info *info)
 {
-	pci_msi_mask_irq(d);
-	irq_chip_mask_parent(d);
-}
+	struct irq_chip *chip = info->chip;
 
-static void pnv_msi_unmask(struct irq_data *d)
-{
-	pci_msi_unmask_irq(d);
-	irq_chip_unmask_parent(d);
-}
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
 
-static struct irq_chip pnv_pci_msi_irq_chip = {
-	.name		= "PNV-PCI-MSI",
-	.irq_shutdown	= pnv_msi_shutdown,
-	.irq_mask	= pnv_msi_mask,
-	.irq_unmask	= pnv_msi_unmask,
-	.irq_eoi	= irq_chip_eoi_parent,
-};
+	chip->irq_shutdown = pnv_msi_shutdown;
+	return true;
+}
 
-static struct msi_domain_info pnv_msi_domain_info = {
-	.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		  MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX),
-	.chip  = &pnv_pci_msi_irq_chip,
+#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS		| \
+				    MSI_FLAG_USE_DEF_CHIP_OPS		| \
+				    MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK		| \
+				     MSI_FLAG_PCI_MSIX			| \
+				     MSI_FLAG_MULTI_PCI_MSI)
+
+static const struct msi_parent_ops pnv_msi_parent_ops = {
+	.required_flags		= PNV_PCI_MSI_FLAGS_REQUIRED,
+	.supported_flags	= PNV_PCI_MSI_FLAGS_SUPPORTED,
+	.chip_flags		= MSI_CHIP_FLAG_SET_EOI,
+	.bus_select_token	= DOMAIN_BUS_NEXUS,
+	.bus_select_mask	= MATCH_PCI_MSI,
+	.prefix			= "PNV-",
+	.init_dev_msi_info	= pnv_init_dev_msi_info,
 };
 
 static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
@@ -1855,37 +1859,26 @@ static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
 }
 
 static const struct irq_domain_ops pnv_irq_domain_ops = {
+	.select	= msi_lib_irq_domain_select,
 	.alloc  = pnv_irq_domain_alloc,
 	.free   = pnv_irq_domain_free,
 };
 
 static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
 {
-	struct pnv_phb *phb = hose->private_data;
 	struct irq_domain *parent = irq_get_default_domain();
-
-	hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);
-	if (!hose->fwnode)
-		return -ENOMEM;
-
-	hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
-						       hose->fwnode,
-						       &pnv_irq_domain_ops, hose);
+	struct irq_domain_info info = {
+		.fwnode		= of_fwnode_handle(hose->dn),
+		.ops		= &pnv_irq_domain_ops,
+		.host_data	= hose,
+		.size		= count,
+		.parent		= parent,
+	};
+
+	hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops);
 	if (!hose->dev_domain) {
-		pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
-		       hose->dn, hose->global_number);
-		irq_domain_free_fwnode(hose->fwnode);
-		return -ENOMEM;
-	}
-
-	hose->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(hose->dn),
-						     &pnv_msi_domain_info,
-						     hose->dev_domain);
-	if (!hose->msi_domain) {
 		pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
 		       hose->dn, hose->global_number);
-		irq_domain_free_fwnode(hose->fwnode);
-		irq_domain_remove(hose->dev_domain);
 		return -ENOMEM;
 	}
 
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH 3/3] powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()
  2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
  2025-06-26 14:47 ` [PATCH 1/3] powerpc/xive: Untangle xive from child interrupt controller drivers Nam Cao
  2025-06-26 14:47 ` [PATCH 2/3] powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain() Nam Cao
@ 2025-06-26 14:47 ` Nam Cao
  2025-07-03 16:37 ` [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Thomas Gleixner
  2025-07-22  8:35 ` Gautam Menghani
  4 siblings, 0 replies; 9+ messages in thread
From: Nam Cao @ 2025-06-26 14:47 UTC (permalink / raw)
  To: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel
  Cc: Nam Cao

Move away from the legacy MSI domain setup, switch to use
msi_create_parent_irq_domain().

Signed-off-by: Nam Cao <namcao@linutronix.de>
---
 arch/powerpc/include/asm/pci-bridge.h  |   2 -
 arch/powerpc/platforms/pseries/Kconfig |   1 +
 arch/powerpc/platforms/pseries/msi.c   | 106 ++++++++++---------------
 3 files changed, 45 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 2aa3a091ef20e..1dae53130782a 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -133,8 +133,6 @@ struct pci_controller {
 
 	/* IRQ domain hierarchy */
 	struct irq_domain	*dev_domain;
-	struct irq_domain	*msi_domain;
-	struct fwnode_handle	*fwnode;
 
 	/* iommu_ops support */
 	struct iommu_device	iommu;
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index fa3c2fff082a8..3e042218d6cd8 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -7,6 +7,7 @@ config PPC_PSERIES
 	select OF_DYNAMIC
 	select FORCE_PCI
 	select PCI_MSI
+	select IRQ_MSI_LIB
 	select GENERIC_ALLOCATOR
 	select PPC_XICS
 	select PPC_XIVE_SPAPR
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 10712477938e4..70be6e24427da 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/device.h>
 #include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
 #include <linux/seq_file.h>
@@ -429,8 +430,9 @@ static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type,
 static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev,
 				   int nvec, msi_alloc_info_t *arg)
 {
+	struct msi_domain_info *info = domain->host_data;
 	struct pci_dev *pdev = to_pci_dev(dev);
-	int type = pdev->msix_enabled ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
+	int type = (info->flags & MSI_FLAG_PCI_MSIX) ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI;
 
 	return rtas_prepare_msi_irqs(pdev, nvec, type, arg);
 }
@@ -447,11 +449,6 @@ static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev)
 	rtas_disable_msi(to_pci_dev(dev));
 }
 
-static struct msi_domain_ops pseries_pci_msi_domain_ops = {
-	.msi_prepare	= pseries_msi_ops_prepare,
-	.msi_post_free	= pseries_msi_post_free,
-};
-
 static void pseries_msi_shutdown(struct irq_data *d)
 {
 	d = d->parent_data;
@@ -459,18 +456,6 @@ static void pseries_msi_shutdown(struct irq_data *d)
 		d->chip->irq_shutdown(d);
 }
 
-static void pseries_msi_mask(struct irq_data *d)
-{
-	pci_msi_mask_irq(d);
-	irq_chip_mask_parent(d);
-}
-
-static void pseries_msi_unmask(struct irq_data *d)
-{
-	pci_msi_unmask_irq(d);
-	irq_chip_unmask_parent(d);
-}
-
 static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
 	struct msi_desc *entry = irq_data_get_msi_desc(data);
@@ -485,27 +470,39 @@ static void pseries_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 	entry->msg = *msg;
 }
 
-static struct irq_chip pseries_pci_msi_irq_chip = {
-	.name		= "pSeries-PCI-MSI",
-	.irq_shutdown	= pseries_msi_shutdown,
-	.irq_mask	= pseries_msi_mask,
-	.irq_unmask	= pseries_msi_unmask,
-	.irq_eoi	= irq_chip_eoi_parent,
-	.irq_write_msi_msg	= pseries_msi_write_msg,
-};
+static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				      struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
 
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
 
-/*
- * Set MSI_FLAG_MSIX_CONTIGUOUS as there is no way to express to
- * firmware to request a discontiguous or non-zero based range of
- * MSI-X entries. Core code will reject such setup attempts.
- */
-static struct msi_domain_info pseries_msi_domain_info = {
-	.flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-		  MSI_FLAG_MULTI_PCI_MSI  | MSI_FLAG_PCI_MSIX |
-		  MSI_FLAG_MSIX_CONTIGUOUS),
-	.ops   = &pseries_pci_msi_domain_ops,
-	.chip  = &pseries_pci_msi_irq_chip,
+	chip->irq_shutdown = pseries_msi_shutdown;
+	chip->irq_write_msi_msg	= pseries_msi_write_msg;
+
+	info->ops->msi_prepare = pseries_msi_ops_prepare;
+	info->ops->msi_post_free = pseries_msi_post_free;
+
+	return true;
+}
+
+#define PSERIES_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS	| \
+					MSI_FLAG_USE_DEF_CHIP_OPS	| \
+					MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define PSERIES_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK		| \
+					 MSI_FLAG_PCI_MSIX		| \
+					 MSI_FLAG_MSIX_CONTIGUOUS	| \
+					 MSI_FLAG_MULTI_PCI_MSI)
+
+static const struct msi_parent_ops pseries_msi_parent_ops = {
+	.required_flags		= PSERIES_PCI_MSI_FLAGS_REQUIRED,
+	.supported_flags	= PSERIES_PCI_MSI_FLAGS_SUPPORTED,
+	.chip_flags		= MSI_CHIP_FLAG_SET_EOI,
+	.bus_select_token	= DOMAIN_BUS_NEXUS,
+	.bus_select_mask	= MATCH_PCI_MSI,
+	.prefix			= "pSeries-",
+	.init_dev_msi_info	= pseries_init_dev_msi_info,
 };
 
 static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
@@ -593,6 +590,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq
 }
 
 static const struct irq_domain_ops pseries_irq_domain_ops = {
+	.select	= msi_lib_irq_domain_select,
 	.alloc  = pseries_irq_domain_alloc,
 	.free   = pseries_irq_domain_free,
 };
@@ -601,30 +599,18 @@ static int __pseries_msi_allocate_domains(struct pci_controller *phb,
 					  unsigned int count)
 {
 	struct irq_domain *parent = irq_get_default_domain();
-
-	phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI",
-						       phb->global_number);
-	if (!phb->fwnode)
-		return -ENOMEM;
-
-	phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
-						      phb->fwnode,
-						      &pseries_irq_domain_ops, phb);
+	struct irq_domain_info info = {
+		.fwnode		= of_fwnode_handle(phb->dn),
+		.ops		= &pseries_irq_domain_ops,
+		.host_data	= phb,
+		.size		= count,
+		.parent		= parent,
+	};
+
+	phb->dev_domain = msi_create_parent_irq_domain(&info, &pseries_msi_parent_ops);
 	if (!phb->dev_domain) {
-		pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
-		       phb->dn, phb->global_number);
-		irq_domain_free_fwnode(phb->fwnode);
-		return -ENOMEM;
-	}
-
-	phb->msi_domain = pci_msi_create_irq_domain(of_fwnode_handle(phb->dn),
-						    &pseries_msi_domain_info,
-						    phb->dev_domain);
-	if (!phb->msi_domain) {
 		pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
 		       phb->dn, phb->global_number);
-		irq_domain_free_fwnode(phb->fwnode);
-		irq_domain_remove(phb->dev_domain);
 		return -ENOMEM;
 	}
 
@@ -646,12 +632,8 @@ int pseries_msi_allocate_domains(struct pci_controller *phb)
 
 void pseries_msi_free_domains(struct pci_controller *phb)
 {
-	if (phb->msi_domain)
-		irq_domain_remove(phb->msi_domain);
 	if (phb->dev_domain)
 		irq_domain_remove(phb->dev_domain);
-	if (phb->fwnode)
-		irq_domain_free_fwnode(phb->fwnode);
 }
 
 static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
-- 
2.39.5


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
  2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
                   ` (2 preceding siblings ...)
  2025-06-26 14:47 ` [PATCH 3/3] powerpc/pseries/msi: Switch to msi_create_parent_irq_domain() Nam Cao
@ 2025-07-03 16:37 ` Thomas Gleixner
  2025-07-22  8:35 ` Gautam Menghani
  4 siblings, 0 replies; 9+ messages in thread
From: Thomas Gleixner @ 2025-07-03 16:37 UTC (permalink / raw)
  To: Nam Cao, Marc Zyngier, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy, linuxppc-dev, linux-kernel
  Cc: Nam Cao

On Thu, Jun 26 2025 at 16:47, Nam Cao wrote:
> The solution is implementing per device MSI domains, this means the
> entities which provide global PCI/MSI domain so far have to implement MSI
> parent domain functionality instead.
>
> This series:
>
>    - Untangle XIVE driver from Powernv and Pseries drivers
>
>    - Convert the Powernv and Pseries drivers to implement MSI parent domain
>      functionality
>
> Nam Cao (3):
>   powerpc/xive: Untangle xive from child interrupt controller drivers
>   powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain()
>   powerpc/pseries/msi: Switch to msi_create_parent_irq_domain()

Gentle reminder @PPC people.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
  2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
                   ` (3 preceding siblings ...)
  2025-07-03 16:37 ` [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Thomas Gleixner
@ 2025-07-22  8:35 ` Gautam Menghani
  2025-07-22  9:24   ` Nam Cao
  4 siblings, 1 reply; 9+ messages in thread
From: Gautam Menghani @ 2025-07-22  8:35 UTC (permalink / raw)
  To: Nam Cao
  Cc: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel

Hi,

I am seeing a boot failure after applying this series on top of the pci
tree [1]. Note that this error was seen on a system where I have a
dedicated NVME. Systems without dedicated disk boot fine

[    2.119058] nvme nvme3: D3 entry latency set to 8 seconds
[    2.132609] xive: H_INT_GET_SOURCE_INFO lisn=0x1 failed -55
[    4.486307] nvme nvme0: D3 entry latency set to 10 seconds
[   28.193633] watchdog: BUG: soft lockup - CPU#280 stuck for 26s! [kworker/280:0:1436]
[   28.193637] CPU#280 Utilization every 4s during lockup:
[   28.193640]  #1: 101% system,          0% softirq,     0% hardirq,     0% idle
[   28.193648]  #2: 100% system,          0% softirq,     0% hardirq,     0% idle
[   28.193650]  #3: 100% system,          0% softirq,     0% hardirq,     0% idle
[   28.193653]  #4: 101% system,          0% softirq,     0% hardirq,     0% idle
[   28.193654]  #5: 100% system,          0% softirq,     0% hardirq,     0% idle
[   28.193657] Modules linked in: nvme nvme_core nvme_keyring nvme_auth pseries_wdt scsi_dh_rdac scsi_dh_emc scsi_dh_alua aes_gcm_p10_crypto crypto_simd cryptd
[   28.193672] CPU: 280 UID: 0 PID: 1436 Comm: kworker/280:0 Not tainted 6.16.0-rc1+ #5 VOLUNTARY
[   28.193675] Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.00 (NH1110_015) hv:phyp pSeries
[   28.193677] Workqueue: events work_for_cpu_fn
[   28.193684] NIP:  c0000000017d8a84 LR: c00000000032d168 CTR: c0000000001a3c20
[   28.193686] REGS: c000001c03b8b0d0 TRAP: 0900   Not tainted  (6.16.0-rc1+)
[   28.193687] MSR:  800000000280b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 24442220  XER: 00000010
[   28.193694] CFAR: 0000000000000000 IRQMASK: 0
[   28.193694] GPR00: 18000003852fe563 c000001c03b8b370 c000000001fb8100 c000001c297f1b0c
[   28.193694] GPR04: 0000000000000003 0000000000000009 0000000038c78f48 0000000000000004
[   28.193694] GPR08: ffffffffffffffff 0000000000000020 c000001c297f2b50 0000000000000003
[   28.193694] GPR12: c000001c297f2b00 c00000258fffeb00 c000000000296018 c000000003d7c040
[   28.193694] GPR16: 0000000000000000 0000000000000000 0000000000000000 c000001c03b8b8b0
[   28.193694] GPR20: c000001c02365800 c000000027da9000 c00000000298b9e8 0000000000000002
[   28.193694] GPR24: 0000000000000001 0000000000000001 000000000000003f 0000000000000001
[   28.193694] GPR28: 000000010000003e c000000001a4e298 0000000038c78f48 3fffffe3d680d500
[   28.193712] NIP [c0000000017d8a84] mtree_load+0x244/0x370
[   28.193717] LR [c00000000032d168] irq_to_desc+0x28/0x40
[   28.193721] Call Trace:
[   28.193722] [c000001c03b8b370] [0000000000000001] 0x1 (unreliable)
[   28.193727] [c000001c03b8b420] [c00000000032d168] irq_to_desc+0x28/0x40
[   28.193729] [c000001c03b8b440] [c0000000003367dc] irq_get_irq_data+0x1c/0x40
[   28.193733] [c000001c03b8b460] [c00000000033bbbc] irq_domain_free_irqs_hierarchy+0x5c/0xe0
[   28.193736] [c000001c03b8b4a0] [c0000000001eaa88] pseries_irq_domain_alloc+0x1d8/0x2e0
[   28.193740] [c000001c03b8b5c0] [c00000000033b790] irq_domain_alloc_irqs_parent+0x40/0xa0
[   28.193742] [c000001c03b8b620] [c0000000003418bc] msi_domain_alloc+0xcc/0x230
[   28.193744] [c000001c03b8b6a0] [c00000000033b824] irq_domain_alloc_irqs_hierarchy+0x34/0x90
[   28.193747] [c000001c03b8b700] [c00000000033d16c] irq_domain_alloc_irqs_locked+0x16c/0x5a0
[   28.193749] [c000001c03b8b7e0] [c00000000033dbc0] __irq_domain_alloc_irqs+0x70/0xd0
[   28.193751] [c000001c03b8b880] [c000000000342988] __msi_domain_alloc_irqs+0x208/0x510
[   28.193754] [c000001c03b8b940] [c0000000003445ec] msi_domain_alloc_irqs_all_locked+0x6c/0x100
[   28.193757] [c000001c03b8b9a0] [c000000000eca400] pci_msi_setup_msi_irqs+0x60/0x80
[   28.193761] [c000001c03b8b9c0] [c000000000ec8bcc] msix_setup_interrupts+0x18c/0x2f0
[   28.193764] [c000001c03b8baa0] [c000000000ec92ac] __pci_enable_msix_range+0x57c/0x840
[   28.193767] [c000001c03b8bb70] [c000000000ec6948] pci_alloc_irq_vectors_affinity+0xf8/0x1d0
[   28.193769] [c000001c03b8bc00] [c0080000131641cc] nvme_setup_io_queues+0x2c4/0x570 [nvme]
[   28.193776] [c000001c03b8bd00] [c008000013167e98] nvme_probe+0x340/0x450 [nvme]
[   28.193780] [c000001c03b8bda0] [c000000000eb6bb4] local_pci_probe+0x64/0xf0
[   28.193784] [c000001c03b8be20] [c000000000280804] work_for_cpu_fn+0x34/0x50
[   28.193786] [c000001c03b8be50] [c0000000002865c0] process_one_work+0x1f0/0x500
[   28.193788] [c000001c03b8bf00] [c00000000028800c] worker_thread+0x33c/0x510
[   28.193791] [c000001c03b8bf90] [c000000000296160] kthread+0x150/0x160
[   28.193793] [c000001c03b8bfe0] [c00000000000ded8] start_kernel_thread+0x14/0x18
[   28.193795] Code: 2c090001 4182fe5c 4bfffeb8 280b0002 79291d68 4081ffac 280b0003 39400000 4082ffb0 394c0050 7c6a482a 7c2004ac <e92c0000> 792905e4 7c2c4840 4082ffac



I eventually see a kernel OOPS

[  119.622966] xive: H_INT_GET_SOURCE_INFO lisn=0x1 failed -55
[  119.623008] Kernel attempted to read user page (0) - exploit attempt? (uid: 0)
[  119.623028] BUG: Kernel NULL pointer dereference on read at 0x00000000
[  119.623048] Faulting instruction address: 0xc0000000001a0e48
[  119.623056] Oops: Kernel access of bad area, sig: 11 [#1]
[  119.623062] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA pSeries
[  119.623074] Modules linked in: nvme nvme_core nvme_keyring nvme_auth pseries_wdt scsi_dh_rdac scsi_dh_emc scsi_dh_alua aes_gcm_p10_crypto crypto_simd cryptd
[  119.623096] CPU: 48 UID: 0 PID: 1 Comm: systemd Tainted: G             L      6.16.0-rc1+ #5 VOLUNTARY
[  119.623104] Tainted: [L]=SOFTLOCKUP
[  119.623108] Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.00 (NH1110_015) hv:phyp pSeries
[  119.623115] NIP:  c0000000001a0e48 LR: c0000000003334b8 CTR: c0000000001a0de0
[  119.623122] REGS: c000000008087080 TRAP: 0300   Tainted: G             L       (6.16.0-rc1+)
[  119.623129] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 24484408  XER: 00000155
[  119.623157] CFAR: c0000000001a0e78 DAR: 0000000000000000 DSISR: 40000000 IRQMASK: 1
[  119.623157] GPR00: c000000000333a98 c000000008087320 c000000001fb8100 c0000008ed903118
[  119.623157] GPR04: 0000000000000001 0000000000000061 0000000000000000 0000000000000000
[  119.623157] GPR08: c0000008ed903000 0000000002030001 0000000000000000 0000000000000000
[  119.623157] GPR12: c0000000001a0de0 c0000008cffc4700 0000000000000000 0000000000000000
[  119.623157] GPR16: c000000001997440 c000000061001400 c0000008ed432e80 0000000000500001
[  119.623157] GPR20: 000fffffffe00000 0000000000500001 ffffffffffffffff c000000001dd7478
[  119.623157] GPR24: 0000000000000000 c00000128de55000 c0000008ed903194 c0000008ed903248
[  119.623157] GPR28: 000000000000003a c000000000334168 c000000063929800 0000000000000001
[  119.623223] NIP [c0000000001a0e48] xive_irq_set_type+0x68/0x130
[  119.623230] LR [c0000000003334b8] __irq_set_trigger+0x88/0x270
[  119.623238] Call Trace:
[  119.623242] [c000000008087320] [c000000008087390] 0xc000000008087390 (unreliable)
[  119.623250] [c0000000080873b0] [c000000000333a98] __setup_irq+0x3f8/0x980
[  119.623257] [c000000008087450] [c000000000334168] request_threaded_irq+0x148/0x270
[  119.623265] [c0000000080874c0] [c000000000fb6ff4] notifier_add_irq+0x64/0x90
[  119.623274] [c0000000080874f0] [c000000000fb56a4] hvc_open+0x94/0x1b0
[  119.623281] [c000000008087570] [c000000000f81304] tty_open+0x1f4/0x7e0
[  119.623289] [c000000008087620] [c0000000007c6778] chrdev_open+0x158/0x390
[  119.623297] [c000000008087690] [c0000000007b59c4] do_dentry_open+0x294/0x790
[  119.623305] [c0000000080876f0] [c0000000007b816c] vfs_open+0x3c/0x140
[  119.623313] [c000000008087730] [c0000000007d75cc] do_open+0x35c/0x540
[  119.623322] [c000000008087790] [c0000000007dd080] path_openat+0x140/0x310
[  119.623328] [c000000008087800] [c0000000007dd324] do_filp_open+0xd4/0x1c0
[  119.623334] [c000000008087940] [c0000000007b87bc] do_sys_openat2+0xbc/0x160
[  119.623340] [c0000000080879c0] [c0000000007b8c1c] sys_openat+0x7c/0xd0
[  119.623346] [c000000008087a20] [c000000000032610] system_call_exception+0x160/0x310
[  119.623353] [c000000008087e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
[  119.623360] ---- interrupt: 3000 at 0x7fff7fd18694
[  119.623364] NIP:  00007fff7fd18694 LR: 00007fff7fd18694 CTR: 0000000000000000
[  119.623368] REGS: c000000008087e80 TRAP: 3000   Tainted: G             L       (6.16.0-rc1+)
[  119.623373] MSR:  800000000280f033 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 48488408  XER: 00000000
[  119.623384] IRQMASK: 0
[  119.623384] GPR00: 000000000000011e 00007fffc3826190 0000000000100000 ffffffffffffff9c
[  119.623384] GPR04: 00007fff8070cd78 0000000000080101 0000000000000000 0000000000000000
[  119.623384] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[  119.623384] GPR12: 0000000000000000 00007fff808d2b80 0000000000000000 00007fffc3826708
[  119.623384] GPR16: 00007fffc38266d0 00007fff80715230 0000000000000001 00000001183fc840
[  119.623384] GPR20: 00007fffc38266b0 00007fff807244a8 0000000000000006 00007fff807244a8
[  119.623384] GPR24: 0000000000000000 0000000000000000 00007fffc3826570 00007fff8070cd78
[  119.623384] GPR28: 0000000000080101 00007fffc3826278 0000000000000015 fffffffffffffff7
[  119.623430] NIP [00007fff7fd18694] 0x7fff7fd18694
[  119.623433] LR [00007fff7fd18694] 0x7fff7fd18694
[  119.623436] ---- interrupt: 3000
[  119.623439] Code: 55290036 91280000 e9030010 60420000 81280000 7d292378 91280000 e9030010 60420000 81280000 65290200 91280000 <e9270000> 5528fffe 552907bc 7c0a4000
[  119.623455] ---[ end trace 0000000000000000 ]---
[  119.625060] nvme nvme1: D3 entry latency set to 10 seconds
[  119.627351] pstore: backend (nvram) writing error (-1)
[  119.627358]
[  120.627361] note: systemd[1] exited with irqs disabled
[  120.627425] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
[  121.709848] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b ]---



[1] : https://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git/log/?h=controller/msi-parent


Thanks,
Gautam

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
  2025-07-22  8:35 ` Gautam Menghani
@ 2025-07-22  9:24   ` Nam Cao
  2025-07-22 20:31     ` Nam Cao
  0 siblings, 1 reply; 9+ messages in thread
From: Nam Cao @ 2025-07-22  9:24 UTC (permalink / raw)
  To: Gautam Menghani
  Cc: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel

On Tue, Jul 22, 2025 at 02:05:55PM +0530, Gautam Menghani wrote:
> I am seeing a boot failure after applying this series on top of the pci
> tree [1]. Note that this error was seen on a system where I have a
> dedicated NVME. Systems without dedicated disk boot fine

Thanks for the report.

Using QEMU, I cannot reproduce the exact same problem, but I do observe a
different one. They are likely from the same root cause.

Let me investigate..

Nam

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
  2025-07-22  9:24   ` Nam Cao
@ 2025-07-22 20:31     ` Nam Cao
  2025-07-23 10:57       ` Gautam Menghani
  0 siblings, 1 reply; 9+ messages in thread
From: Nam Cao @ 2025-07-22 20:31 UTC (permalink / raw)
  To: Gautam Menghani
  Cc: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel

> On Tue, Jul 22, 2025 at 02:05:55PM +0530, Gautam Menghani wrote:
> > I am seeing a boot failure after applying this series on top of the pci
> > tree [1]. Note that this error was seen on a system where I have a
> > dedicated NVME. Systems without dedicated disk boot fine
> 
> Thanks for the report.
> 
> Using QEMU, I cannot reproduce the exact same problem, but I do observe a
> different one. They are likely from the same root cause.
> 
> Let me investigate..

So the problem is due to the pair msi_prepare() and msi_post_free(). Before
this series, msi_prepare() is called whenever interrupt is allocated.
However, after this series, msi_prepare() is called only at domain
creation.

For most device drivers, this difference does not have any impact. However,
the NVME driver is slightly "special", it does this:

	1. Allocate interrupts
	2. Free interrupts
	3. Allocate interrupts again

Before this series:

	(1) calls msi_prepare()
	(2) calls msi_post_free()
	(3) calls msi_prepare() again

and it happens to work. However, after this series:

	(1) calls msi_prepare()
	(2) calls msi_post_free()
	(3) does not call either

and we are in trouble.

A simple solution is using msi_teardown() instead, which is called at
domain destruction. It makes more sense this way as well, because
msi_teardown() is supposed to reverse what msi_prepare() does.

This would also remove the only user of msi_post_free(), allowing us to
delete that callback.

The below patch fixes the problem that I saw with QEMU. Does it fix the
problem on your side as well?

Best regards,
Nam


diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 70be6e24427d..7da142dd5baa 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -441,12 +441,12 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
  * RTAS can not disable one MSI at a time. It's all or nothing. Do it
  * at the end after all IRQs have been freed.
  */
-static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev)
+static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
 {
-	if (WARN_ON_ONCE(!dev_is_pci(dev)))
-		return;
+	struct msi_desc *desc = arg->desc;
+	struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
 
-	rtas_disable_msi(to_pci_dev(dev));
+	rtas_disable_msi(pdev);
 }
 
 static void pseries_msi_shutdown(struct irq_data *d)
@@ -482,7 +482,7 @@ static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *dom
 	chip->irq_write_msi_msg	= pseries_msi_write_msg;
 
 	info->ops->msi_prepare = pseries_msi_ops_prepare;
-	info->ops->msi_post_free = pseries_msi_post_free;
+	info->ops->msi_teardown = pseries_msi_ops_teardown;
 
 	return true;
 }

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain
  2025-07-22 20:31     ` Nam Cao
@ 2025-07-23 10:57       ` Gautam Menghani
  0 siblings, 0 replies; 9+ messages in thread
From: Gautam Menghani @ 2025-07-23 10:57 UTC (permalink / raw)
  To: Nam Cao
  Cc: Marc Zyngier, Thomas Gleixner, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy, linuxppc-dev,
	linux-kernel

On Tue, Jul 22, 2025 at 10:31:45PM +0200, Nam Cao wrote:
> > On Tue, Jul 22, 2025 at 02:05:55PM +0530, Gautam Menghani wrote:
> > > I am seeing a boot failure after applying this series on top of the pci
> > > tree [1]. Note that this error was seen on a system where I have a
> > > dedicated NVME. Systems without dedicated disk boot fine
> > 
> > Thanks for the report.
> > 
> > Using QEMU, I cannot reproduce the exact same problem, but I do observe a
> > different one. They are likely from the same root cause.
> > 
> > Let me investigate..
> 
> So the problem is due to the pair msi_prepare() and msi_post_free(). Before
> this series, msi_prepare() is called whenever interrupt is allocated.
> However, after this series, msi_prepare() is called only at domain
> creation.
> 
> For most device drivers, this difference does not have any impact. However,
> the NVME driver is slightly "special", it does this:
> 
> 	1. Allocate interrupts
> 	2. Free interrupts
> 	3. Allocate interrupts again
> 
> Before this series:
> 
> 	(1) calls msi_prepare()
> 	(2) calls msi_post_free()
> 	(3) calls msi_prepare() again
> 
> and it happens to work. However, after this series:
> 
> 	(1) calls msi_prepare()
> 	(2) calls msi_post_free()
> 	(3) does not call either
> 
> and we are in trouble.
> 
> A simple solution is using msi_teardown() instead, which is called at
> domain destruction. It makes more sense this way as well, because
> msi_teardown() is supposed to reverse what msi_prepare() does.
> 
> This would also remove the only user of msi_post_free(), allowing us to
> delete that callback.
> 
> The below patch fixes the problem that I saw with QEMU. Does it fix the
> problem on your side as well?
> 
> Best regards,
> Nam
> 
> 
> diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
> index 70be6e24427d..7da142dd5baa 100644
> --- a/arch/powerpc/platforms/pseries/msi.c
> +++ b/arch/powerpc/platforms/pseries/msi.c
> @@ -441,12 +441,12 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev
>   * RTAS can not disable one MSI at a time. It's all or nothing. Do it
>   * at the end after all IRQs have been freed.
>   */
> -static void pseries_msi_post_free(struct irq_domain *domain, struct device *dev)
> +static void pseries_msi_ops_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
>  {
> -	if (WARN_ON_ONCE(!dev_is_pci(dev)))
> -		return;
> +	struct msi_desc *desc = arg->desc;
> +	struct pci_dev *pdev = msi_desc_to_pci_dev(desc);
>  
> -	rtas_disable_msi(to_pci_dev(dev));
> +	rtas_disable_msi(pdev);
>  }
>  
>  static void pseries_msi_shutdown(struct irq_data *d)
> @@ -482,7 +482,7 @@ static bool pseries_init_dev_msi_info(struct device *dev, struct irq_domain *dom
>  	chip->irq_write_msi_msg	= pseries_msi_write_msg;
>  
>  	info->ops->msi_prepare = pseries_msi_ops_prepare;
> -	info->ops->msi_post_free = pseries_msi_post_free;
> +	info->ops->msi_teardown = pseries_msi_ops_teardown;
>  
>  	return true;
>  }

Hi Nam,

The boot issue is fixed with this diff. I'll do some more testing on
this series and will post more updates.

Thanks,
Gautam

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2025-07-23 10:58 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-26 14:47 [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Nam Cao
2025-06-26 14:47 ` [PATCH 1/3] powerpc/xive: Untangle xive from child interrupt controller drivers Nam Cao
2025-06-26 14:47 ` [PATCH 2/3] powerpc/powernv/pci: Switch to use msi_create_parent_irq_domain() Nam Cao
2025-06-26 14:47 ` [PATCH 3/3] powerpc/pseries/msi: Switch to msi_create_parent_irq_domain() Nam Cao
2025-07-03 16:37 ` [PATCH 0/3] powerpc: Cleanup and convert to MSI parent domain Thomas Gleixner
2025-07-22  8:35 ` Gautam Menghani
2025-07-22  9:24   ` Nam Cao
2025-07-22 20:31     ` Nam Cao
2025-07-23 10:57       ` Gautam Menghani

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).