[PATCH 0/2] PCI: brcmstb: Add panic/die handler to driver

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/2] PCI: brcmstb: Add panic/die handler to driver
@ 2025-06-13 22:08 Jim Quinlan
  2025-06-13 22:08 ` [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active Jim Quinlan
  2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
  0 siblings, 2 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-06-13 22:08 UTC (permalink / raw)
  To: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, james.quinlan
  Cc: moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	Rob Herring

The first commit sets up a field variable and spinlock to indicate whether
the PCIe bridge is active.  The second commit builds upon the first and
adds a "die" handler to the driver, which, when invoked, prints out a
summary of any pending PCIe errors.  The "die" handler is careful not to
access any registers unless the bridge is active.

Jim Quinlan (2):
  PCI: brcmstb: Add a way to indicate if PCIe bridge is active
  PCI: brcmstb: Add panic/die handler to driver

 drivers/pci/controller/pcie-brcmstb.c | 193 +++++++++++++++++++++++++-
 1 file changed, 188 insertions(+), 5 deletions(-)


base-commit: 18531f4d1c8c47c4796289dbbc1ab657ffa063d2
-- 
2.34.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active
  2025-06-13 22:08 [PATCH 0/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
@ 2025-06-13 22:08 ` Jim Quinlan
  2025-06-13 23:23   ` Florian Fainelli
  2025-08-06 19:14   ` Bjorn Helgaas
  2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
  1 sibling, 2 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-06-13 22:08 UTC (permalink / raw)
  To: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, james.quinlan
  Cc: Florian Fainelli, Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

In a future commit, a new handler will be introduced that in part does
reads and writes to some of the PCIe registers.  When this handler is
invoked, it is paramount that it does not do these register accesses when
the PCIe bridge is inactive, as this will cause CPU abort errors.

To solve this we keep a spinlock that guards a variable which indicates
whether the bridge is on or off.  When the bridge is on, access of the PCIe
HW registers may proceed.

Since there are multiple ways to reset the bridge, we introduce a general
function that obtains the spinlock, calls the reset function that is used
for the specific SoC, sets the bridge active indicator variable, and
releases the spinlock.

Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>
---
 drivers/pci/controller/pcie-brcmstb.c | 40 +++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
index 92887b394eb4..400854c893d8 100644
--- a/drivers/pci/controller/pcie-brcmstb.c
+++ b/drivers/pci/controller/pcie-brcmstb.c
@@ -29,6 +29,7 @@
 #include <linux/reset.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
 
@@ -254,6 +255,7 @@ struct pcie_cfg_data {
 	int (*perst_set)(struct brcm_pcie *pcie, u32 val);
 	int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val);
 	int (*post_setup)(struct brcm_pcie *pcie);
+	bool has_err_report;
 };
 
 struct subdev_regulators {
@@ -299,6 +301,8 @@ struct brcm_pcie {
 	struct subdev_regulators *sr;
 	bool			ep_wakeup_capable;
 	const struct pcie_cfg_data	*cfg;
+	bool			bridge_on;
+	spinlock_t		bridge_lock;
 };
 
 static inline bool is_bmips(const struct brcm_pcie *pcie)
@@ -306,6 +310,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie)
 	return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425;
 }
 
+static inline int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val)
+{
+	unsigned long flags;
+	int ret;
+
+	if (pcie->cfg->has_err_report)
+		spin_lock_irqsave(&pcie->bridge_lock, flags);
+
+	ret = pcie->cfg->bridge_sw_init_set(pcie, val);
+	/* val=1 puts the bridge in reset (off); on failure assume it is off */
+	pcie->bridge_on = ret ? false : !val;
+
+	if (pcie->cfg->has_err_report)
+		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+	return ret;
+}
+
 /*
  * This is to convert the size of the inbound "BAR" region to the
  * non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE
@@ -1078,7 +1100,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
 	int memc, ret;
 
 	/* Reset the bridge */
-	ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+	ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
 	if (ret)
 		return ret;
 
@@ -1094,7 +1116,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
 	usleep_range(100, 200);
 
 	/* Take the bridge out of reset */
-	ret = pcie->cfg->bridge_sw_init_set(pcie, 0);
+	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
 	if (ret)
 		return ret;
 
@@ -1545,7 +1567,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie)
 
 	if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN))
 		/* Shutdown PCIe bridge */
-		ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
+		ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
 
 	return ret;
 }
@@ -1633,7 +1655,9 @@ static int brcm_pcie_resume_noirq(struct device *dev)
 		goto err_reset;
 
 	/* Take bridge out of reset so we can access the SERDES reg */
-	pcie->cfg->bridge_sw_init_set(pcie, 0);
+	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+	if (ret)
+		goto err_reset;
 
 	/* SERDES_IDDQ = 0 */
 	tmp = readl(base + HARD_DEBUG(pcie));
@@ -1901,7 +1925,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
 	if (ret)
 		return dev_err_probe(&pdev->dev, ret, "could not enable clock\n");
 
-	pcie->cfg->bridge_sw_init_set(pcie, 0);
+	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
+	if (ret)
+		return dev_err_probe(&pdev->dev, ret,
+				     "could not un-reset the bridge\n");
 
 	if (pcie->swinit_reset) {
 		ret = reset_control_assert(pcie->swinit_reset);
@@ -1976,6 +2003,9 @@ static int brcm_pcie_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	if (pcie->cfg->has_err_report)
+		spin_lock_init(&pcie->bridge_lock);
+
 	return 0;
 
 fail:
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-06-13 22:08 [PATCH 0/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
  2025-06-13 22:08 ` [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active Jim Quinlan
@ 2025-06-13 22:08 ` Jim Quinlan
  2025-06-13 23:28   ` Florian Fainelli
                     ` (2 more replies)
  1 sibling, 3 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-06-13 22:08 UTC (permalink / raw)
  To: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, james.quinlan
  Cc: Florian Fainelli, Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
7216 and its descendants -- have new HW that identifies error details.

This simple handler determines if the PCIe controller was the cause of the
abort and if so, prints out diagnostic info.  Unfortunately, an abort still
occurs.

Care is taken to read the error registers only when the PCIe bridge is
active and the PCIe registers are accessible.  Otherwise, a "die" event
caused by something other than the PCIe could cause an abort if the PCIe
"die" handler tried to access registers when the bridge is off.

Example error output:
  brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
  brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0

Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>
---
 drivers/pci/controller/pcie-brcmstb.c | 155 +++++++++++++++++++++++++-
 1 file changed, 154 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
index 400854c893d8..abc56acad1fe 100644
--- a/drivers/pci/controller/pcie-brcmstb.c
+++ b/drivers/pci/controller/pcie-brcmstb.c
@@ -13,15 +13,18 @@
 #include <linux/ioport.h>
 #include <linux/irqchip/chained_irq.h>
 #include <linux/irqdomain.h>
+#include <linux/kdebug.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/msi.h>
+#include <linux/notifier.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
+#include <linux/panic_notifier.h>
 #include <linux/pci.h>
 #include <linux/pci-ecam.h>
 #include <linux/printk.h>
@@ -151,6 +154,39 @@
 #define  MSI_INT_MASK_SET		0x10
 #define  MSI_INT_MASK_CLR		0x14
 
+/* Outbound error report registers, read by the die/panic error dump */
+#define PCIE_OUTB_ERR_TREAT				0x6000
+#define  PCIE_OUTB_ERR_TREAT_CONFIG_MASK		0x1
+#define  PCIE_OUTB_ERR_TREAT_MEM_MASK			0x2
+#define PCIE_OUTB_ERR_VALID				0x6004
+#define PCIE_OUTB_ERR_CLEAR				0x6008
+#define PCIE_OUTB_ERR_ACC_INFO				0x600c
+#define  PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK		0x01
+#define  PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK		0x02
+#define  PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK		0x04
+#define  PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK		0x10
+#define  PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK		0xff00
+#define PCIE_OUTB_ERR_ACC_ADDR				0x6010
+#define PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK			0xff00000
+#define PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK			0xf8000
+#define PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK		0x7000
+#define PCIE_OUTB_ERR_ACC_ADDR_REG_MASK			0xfff
+#define PCIE_OUTB_ERR_CFG_CAUSE				0x6014
+#define  PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK		0x40
+#define  PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK		0x20
+#define  PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK	0x10
+#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK	0x4
+#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK	0x2
+#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK	0x1
+#define PCIE_OUTB_ERR_MEM_ADDR_LO			0x6018
+#define PCIE_OUTB_ERR_MEM_ADDR_HI			0x601c
+#define PCIE_OUTB_ERR_MEM_CAUSE				0x6020
+#define  PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK		0x40
+#define  PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK		0x20
+#define  PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK	0x10
+#define  PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK	0x2
+#define  PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK		0x1
+
 #define  PCIE_RGR1_SW_INIT_1_PERST_MASK			0x1
 #define  PCIE_RGR1_SW_INIT_1_PERST_SHIFT		0x0
 
@@ -301,6 +337,8 @@ struct brcm_pcie {
 	struct subdev_regulators *sr;
 	bool			ep_wakeup_capable;
 	const struct pcie_cfg_data	*cfg;
+	struct notifier_block	die_notifier;
+	struct notifier_block	panic_notifier;
 	bool			bridge_on;
 	spinlock_t		bridge_lock;
 };
@@ -1711,6 +1749,115 @@ static int brcm_pcie_resume_noirq(struct device *dev)
 	return ret;
 }
 
+/* Log a pending outbound PCIe error (if any) for the die/panic notifiers */
+static int _brcm_pcie_dump_err(struct brcm_pcie *pcie,
+			       const char *type)
+{
+	void __iomem *base = pcie->base;
+	int i, is_cfg_err, is_mem_err, lanes;
+	char *width_str, *direction_str, lanes_str[9];
+	u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pcie->bridge_lock, flags);
+	/* Test bridge_on first so nothing is read while the bridge is off */
+	if (!pcie->bridge_on || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
+		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+		return NOTIFY_DONE;
+	}
+
+	/* Snapshot the registers under the lock; decode and print after unlock */
+	info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
+	is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK);
+	is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK);
+	if (is_cfg_err) {
+		cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
+		cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
+	}
+	if (is_mem_err) {
+		mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
+		lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
+		hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
+	}
+	/* Everything needed is captured; write ERR_CLEAR before unlocking */
+	writel(1, base + PCIE_OUTB_ERR_CLEAR);
+	spin_unlock_irqrestore(&pcie->bridge_lock, flags);
+
+	dev_err(pcie->dev, "handling %s error notification\n", type);
+	width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK) ? "64bit" : "32bit";
+	direction_str = (info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK) ? "Write" : "Read";
+	lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK, info);
+	for (i = 0, lanes_str[8] = 0; i < 8; i++)
+		lanes_str[i] = (lanes & (1 << i)) ? '1' : '0'; /* LSB first */
+
+	if (is_cfg_err) {
+		int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK, cfg_addr);
+		int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK, cfg_addr);
+		int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK, cfg_addr);
+		int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG_MASK, cfg_addr);
+
+		dev_err(pcie->dev, "Error: CFG Acc, %s, %s, Bus=%d, Dev=%d, Fun=%d, Reg=0x%x, lanes=%s\n",
+			width_str, direction_str, bus, dev, func, reg, lanes_str);
+		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK),
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK),
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK),
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK),
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK),
+			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK));
+	}
+
+	if (is_mem_err) {
+		u64 addr = ((u64)hi << 32) | (u64)lo;
+
+		dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
+			width_str, direction_str, addr, lanes_str);
+		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
+			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK),
+			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK),
+			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK),
+			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK),
+			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK));
+	}
+
+	return NOTIFY_OK;
+}
+
+static int brcm_pcie_die_notify_cb(struct notifier_block *self,
+				   unsigned long v, void *p)
+{
+	struct brcm_pcie *pcie;	/* controller embedding this die_notifier */
+
+	pcie = container_of(self, struct brcm_pcie, die_notifier);
+	return _brcm_pcie_dump_err(pcie, "Die");
+}
+
+static int brcm_pcie_panic_notify_cb(struct notifier_block *self,
+				     unsigned long v, void *p)
+{
+	struct brcm_pcie *pcie;	/* controller embedding this panic_notifier */
+
+	pcie = container_of(self, struct brcm_pcie, panic_notifier);
+	return _brcm_pcie_dump_err(pcie, "Panic");
+}
+
+static void brcm_register_die_notifiers(struct brcm_pcie *pcie)
+{
+	struct notifier_block *panic_nb = &pcie->panic_notifier;
+
+	panic_nb->notifier_call = brcm_pcie_panic_notify_cb;
+	atomic_notifier_chain_register(&panic_notifier_list, panic_nb);
+	pcie->die_notifier.notifier_call = brcm_pcie_die_notify_cb;
+	register_die_notifier(&pcie->die_notifier);
+}
+
+static void brcm_unregister_die_notifiers(struct brcm_pcie *pcie)
+{
+	unregister_die_notifier(&pcie->die_notifier); /* die first, then panic */
+	atomic_notifier_chain_unregister(&panic_notifier_list,
+					 &pcie->panic_notifier);
+}
+
 static void __brcm_pcie_remove(struct brcm_pcie *pcie)
 {
 	brcm_msi_remove(pcie);
@@ -1729,6 +1876,9 @@ static void brcm_pcie_remove(struct platform_device *pdev)
 
 	pci_stop_root_bus(bridge->bus);
 	pci_remove_root_bus(bridge->bus);
+	if (pcie->cfg->has_err_report)
+		brcm_unregister_die_notifiers(pcie);
+
 	__brcm_pcie_remove(pcie);
 }
 
@@ -1829,6 +1979,7 @@ static const struct pcie_cfg_data bcm7216_cfg = {
 	.bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_7278,
 	.has_phy	= true,
 	.num_inbound_wins = 3,
+	.has_err_report = true,
 };
 
 static const struct pcie_cfg_data bcm7712_cfg = {
@@ -2003,8 +2154,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	if (pcie->cfg->has_err_report)
+	if (pcie->cfg->has_err_report) {
 		spin_lock_init(&pcie->bridge_lock);
+		brcm_register_die_notifiers(pcie);
+	}
 
 	return 0;
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active
  2025-06-13 22:08 ` [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active Jim Quinlan
@ 2025-06-13 23:23   ` Florian Fainelli
  2025-08-06 19:14   ` Bjorn Helgaas
  1 sibling, 0 replies; 16+ messages in thread
From: Florian Fainelli @ 2025-06-13 23:23 UTC (permalink / raw)
  To: Jim Quinlan, linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024
  Cc: Florian Fainelli, Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On 6/13/25 15:08, Jim Quinlan wrote:
> In a future commit, a new handler will be introduced that in part does
> reads and writes to some of the PCIe registers.  When this handler is
> invoked, it is paramount that it does not do these register accesses when
> the PCIe bridge is inactive, as this will cause CPU abort errors.
> 
> To solve this we keep a spinlock that guards a variable which indicates
> whether the bridge is on or off.  When the bridge is on, access of the PCIe
> HW registers may proceed.
> 
> Since there are multiple ways to reset the bridge, we introduce a general
> function to obtain the spinlock, call the specific function that is used
> for the specific SoC, sets the bridge active indicator variable, and
> releases the spinlock.
> 
> Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>

Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
-- 
Florian


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
@ 2025-06-13 23:28   ` Florian Fainelli
  2025-08-06 18:15   ` Bjorn Helgaas
  2025-08-06 19:14   ` Bjorn Helgaas
  2 siblings, 0 replies; 16+ messages in thread
From: Florian Fainelli @ 2025-06-13 23:28 UTC (permalink / raw)
  To: Jim Quinlan, linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024
  Cc: Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On 6/13/25 15:08, Jim Quinlan wrote:
> Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> 7216 and its descendants -- have new HW that identifies error details.
> 
> This simple handler determines if the PCIe controller was the cause of the
> abort and if so, prints out diagnostic info.  Unfortunately, an abort still
> occurs.
> 
> Care is taken to read the error registers only when the PCIe bridge is
> active and the PCIe registers are acceptable.  Otherwise, a "die" event
> caused by something other than the PCIe could cause an abort if the PCIe
> "die" handler tried to access registers when the bridge is off.
> 
> Example error output:
>    brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
>    brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0
> 
> Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>

Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
-- 
Florian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
  2025-06-13 23:28   ` Florian Fainelli
@ 2025-08-06 18:15   ` Bjorn Helgaas
  2025-08-06 18:38     ` Jim Quinlan
  2025-08-06 19:14   ` Bjorn Helgaas
  2 siblings, 1 reply; 16+ messages in thread
From: Bjorn Helgaas @ 2025-08-06 18:15 UTC (permalink / raw)
  To: Jim Quinlan
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On Fri, Jun 13, 2025 at 06:08:43PM -0400, Jim Quinlan wrote:
> Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> 7216 and its descendants -- have new HW that identifies error details.

What's the long term plan for this?  This abort is a huge problem that
we're seeing across arm64 platforms.  Forcing a panic and reboot for
every uncorrectable error is pretty hard to deal with.

Is there a plan to someday recover from these aborts?  Or change the
hardware so it can at least be configured to return ~0 data after
logging the error in the hardware registers?

> This simple handler determines if the PCIe controller was the cause of the
> abort and if so, prints out diagnostic info.  Unfortunately, an abort still
> occurs.
> 
> Care is taken to read the error registers only when the PCIe bridge is
> active and the PCIe registers are acceptable.  Otherwise, a "die" event
> caused by something other than the PCIe could cause an abort if the PCIe
> "die" handler tried to access registers when the bridge is off.

Checking whether the bridge is active is a "mostly-works" situation
since it's always racy.

> Example error output:
>   brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
>   brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-06 18:15   ` Bjorn Helgaas
@ 2025-08-06 18:38     ` Jim Quinlan
  2025-08-06 18:50       ` Bjorn Helgaas
  0 siblings, 1 reply; 16+ messages in thread
From: Jim Quinlan @ 2025-08-06 18:38 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

[-- Attachment #1: Type: text/plain, Size: 2419 bytes --]

On Wed, Aug 6, 2025 at 2:15 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Jun 13, 2025 at 06:08:43PM -0400, Jim Quinlan wrote:
> > Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> > by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> > 7216 and its descendants -- have new HW that identifies error details.
>
> What's the long term plan for this?  This abort is a huge problem that
> we're seeing across arm64 platforms.  Forcing a panic and reboot for
> every uncorrectable error is pretty hard to deal with.


Hello Bjorn,
Are you referring to STB/CM systems, Rpi, or something else altogether?

>
> Is there a plan to someday recover from these aborts?  Or change the
> hardware so it can at least be configured to return ~0 data after
> logging the error in the hardware registers?

Some of our upcoming chips will have the ability to do nothing on
errant PCIe writes and return 0xffffffff on errant PCIe reads.   But
none of our STB/CM chips do this currently.   I've been asking for
this behavior for years but I have limited influence on what happens
in HW.

>
>
> > This simple handler determines if the PCIe controller was the cause of the
> > abort and if so, prints out diagnostic info.  Unfortunately, an abort still
> > occurs.
> >
> > Care is taken to read the error registers only when the PCIe bridge is
> > active and the PCIe registers are acceptable.  Otherwise, a "die" event
> > caused by something other than the PCIe could cause an abort if the PCIe
> > "die" handler tried to access registers when the bridge is off.
>
> Checking whether the bridge is active is a "mostly-works" situation
> since it's always racy.

I'm not sure I understand the "racy" comment.  If the PCIe bridge is
off, we do not read the PCIe error registers.  In this case, PCIe is
probably not the cause of the panic.   In the rare case the PCIe
bridge is off  and it was the PCIe that caused the panic, nothing gets
reported, and this is where we are without this commit.  Perhaps this
is what you mean by "mostly-works".  But this is the best that can be
done with SW given our HW.

Regards,
Jim Quinlan
Broadcom STB/CM
>
>
> > Example error output:
> >   brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
> >   brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4197 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-06 18:38     ` Jim Quinlan
@ 2025-08-06 18:50       ` Bjorn Helgaas
  2025-08-06 19:16         ` Jim Quinlan
  2025-08-06 20:41         ` Florian Fainelli
  0 siblings, 2 replies; 16+ messages in thread
From: Bjorn Helgaas @ 2025-08-06 18:50 UTC (permalink / raw)
  To: Jim Quinlan
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On Wed, Aug 06, 2025 at 02:38:12PM -0400, Jim Quinlan wrote:
> On Wed, Aug 6, 2025 at 2:15 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> >
> > On Fri, Jun 13, 2025 at 06:08:43PM -0400, Jim Quinlan wrote:
> > > Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> > > by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> > > 7216 and its descendants -- have new HW that identifies error details.
> >
> > What's the long term plan for this?  This abort is a huge problem that
> > we're seeing across arm64 platforms.  Forcing a panic and reboot for
> > every uncorrectable error is pretty hard to deal with.
> 
> Are you referring to STB/CM systems, Rpi, or something else altogether?

Just in general.  I saw this recently with a Nuvoton NPCM8xx PCIe
controller.  I'm not an arm64 guy, but I've been told that these
aborts are basically unrecoverable from a kernel perspective.  For
some reason several PCIe controllers intended for arm64 seem to raise
aborts on PCIe errors.  At the moment, that means we can't recover
from errors like surprise unplugs and other things that *should* be
recoverable (perhaps at the cost of resetting or disabling a PCIe
device).

> > Is there a plan to someday recover from these aborts?  Or change the
> > hardware so it can at least be configured to return ~0 data after
> > logging the error in the hardware registers?
> 
> Some of our upcoming chips will have the ability to do nothing on
> errant PCIe writes and return 0xffffffff on errant PCIe reads.   But
> none of our STB/CM chips do this currently.   I've been asking for
> this behavior for years but I have limited influence on what happens
> in HW.

Fingers crossed for either that or some other way to make these things
recoverable.

> > > This simple handler determines if the PCIe controller was the
> > > cause of the abort and if so, prints out diagnostic info.
> > > Unfortunately, an abort still occurs.
> > >
> > > Care is taken to read the error registers only when the PCIe
> > > bridge is active and the PCIe registers are acceptable.
> > > Otherwise, a "die" event caused by something other than the PCIe
> > > could cause an abort if the PCIe "die" handler tried to access
> > > registers when the bridge is off.
> >
> > Checking whether the bridge is active is a "mostly-works"
> > situation since it's always racy.
> 
> I'm not sure I understand the "racy" comment.  If the PCIe bridge is
> off, we do not read the PCIe error registers.  In this case, PCIe is
> probably not the cause of the panic.   In the rare case the PCIe
> bridge is off  and it was the PCIe that caused the panic, nothing
> gets reported, and this is where we are without this commit.
> Perhaps this is what you mean by "mostly-works".  But this is the
> best that can be done with SW given our HW.

Right, my fault.  The error report registers don't look like standard
PCIe things, so I suppose they are on the host side, not the PCIe
side, so they're probably guaranteed to be accessible and non-racy
unless the bridge is in reset.

Bjorn

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active
  2025-06-13 22:08 ` [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active Jim Quinlan
  2025-06-13 23:23   ` Florian Fainelli
@ 2025-08-06 19:14   ` Bjorn Helgaas
  2025-08-07 18:03     ` Jim Quinlan
  1 sibling, 1 reply; 16+ messages in thread
From: Bjorn Helgaas @ 2025-08-06 19:14 UTC (permalink / raw)
  To: Jim Quinlan
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On Fri, Jun 13, 2025 at 06:08:42PM -0400, Jim Quinlan wrote:
> In a future commit, a new handler will be introduced that in part does
> reads and writes to some of the PCIe registers.  When this handler is
> invoked, it is paramount that it does not do these register accesses when
> the PCIe bridge is inactive, as this will cause CPU abort errors.
> 
> To solve this we keep a spinlock that guards a variable which indicates
> whether the bridge is on or off.  When the bridge is on, access of the PCIe
> HW registers may proceed.
> 
> Since there are multiple ways to reset the bridge, we introduce a general
> function to obtain the spinlock, call the specific function that is used
> for the specific SoC, sets the bridge active indicator variable, and
> releases the spinlock.
> 
> Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>
> ---
>  drivers/pci/controller/pcie-brcmstb.c | 40 +++++++++++++++++++++++----
>  1 file changed, 35 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
> index 92887b394eb4..400854c893d8 100644
> --- a/drivers/pci/controller/pcie-brcmstb.c
> +++ b/drivers/pci/controller/pcie-brcmstb.c
> @@ -29,6 +29,7 @@
>  #include <linux/reset.h>
>  #include <linux/sizes.h>
>  #include <linux/slab.h>
> +#include <linux/spinlock.h>
>  #include <linux/string.h>
>  #include <linux/types.h>
>  
> @@ -254,6 +255,7 @@ struct pcie_cfg_data {
>  	int (*perst_set)(struct brcm_pcie *pcie, u32 val);
>  	int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val);
>  	int (*post_setup)(struct brcm_pcie *pcie);
> +	bool has_err_report;

It doesn't look worth it to me to add this.  It only avoids locking in
a non-performance path.

>  };
>  
>  struct subdev_regulators {
> @@ -299,6 +301,8 @@ struct brcm_pcie {
>  	struct subdev_regulators *sr;
>  	bool			ep_wakeup_capable;
>  	const struct pcie_cfg_data	*cfg;
> +	bool			bridge_on;
> +	spinlock_t		bridge_lock;
>  };
>  
>  static inline bool is_bmips(const struct brcm_pcie *pcie)
> @@ -306,6 +310,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie)
>  	return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425;
>  }
>  
> +static inline int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val)
> +{
> +	unsigned long flags;
> +	int ret;
> +
> +	if (pcie->cfg->has_err_report)
> +		spin_lock_irqsave(&pcie->bridge_lock, flags);
> +
> +	ret = pcie->cfg->bridge_sw_init_set(pcie, val);
> +	if (ret)
> +		pcie->bridge_on = !val;

AFAICT, .bridge_sw_init_set(1) asserts reset, .bridge_sw_init_set(0)
deasserts reset, and it returns 0 for success, so I'm confused about
this.  If either assert or deassert failed (ret != 0), I guess we
don't know the state of the bridge and can't assume it's active, so I
would have expected something like:

  ret = pcie->cfg->bridge_sw_init_set(pcie, val);
  if (ret)
    pcie->bridge_on = false;
  else
    pcie->bridge_on = !val;

Tangent: the last "return ret" in brcm_pcie_bridge_sw_init_set_generic()
should be "return 0" and drop the unnecessary initialization of "ret".

And the code there would be vastly improved by using FIELD_PREP() or
u32p_replace_bits() and getting rid of the shifting.

> +	if (pcie->cfg->has_err_report)
> +		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> +
> +	return ret;
> +}
> +
>  /*
>   * This is to convert the size of the inbound "BAR" region to the
>   * non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE
> @@ -1078,7 +1100,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
>  	int memc, ret;
>  
>  	/* Reset the bridge */
> -	ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
> +	ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
>  	if (ret)
>  		return ret;
>  
> @@ -1094,7 +1116,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
>  	usleep_range(100, 200);
>  
>  	/* Take the bridge out of reset */
> -	ret = pcie->cfg->bridge_sw_init_set(pcie, 0);
> +	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
>  	if (ret)
>  		return ret;
>  
> @@ -1545,7 +1567,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie)
>  
>  	if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN))
>  		/* Shutdown PCIe bridge */
> -		ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
> +		ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
>  
>  	return ret;
>  }
> @@ -1633,7 +1655,9 @@ static int brcm_pcie_resume_noirq(struct device *dev)
>  		goto err_reset;
>  
>  	/* Take bridge out of reset so we can access the SERDES reg */
> -	pcie->cfg->bridge_sw_init_set(pcie, 0);
> +	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
> +	if (ret)
> +		goto err_reset;
>  
>  	/* SERDES_IDDQ = 0 */
>  	tmp = readl(base + HARD_DEBUG(pcie));
> @@ -1901,7 +1925,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
>  	if (ret)
>  		return dev_err_probe(&pdev->dev, ret, "could not enable clock\n");
>  
> -	pcie->cfg->bridge_sw_init_set(pcie, 0);
> +	ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
> +	if (ret)
> +		return dev_err_probe(&pdev->dev, ret,
> +				     "could not un-reset the bridge\n");

"un-reset" doesn't mean anything to me.  Is this the same as "could
not take the bridge out of reset"?  Or maybe "could not deassert
bridge reset"?

>  	if (pcie->swinit_reset) {
>  		ret = reset_control_assert(pcie->swinit_reset);
> @@ -1976,6 +2003,9 @@ static int brcm_pcie_probe(struct platform_device *pdev)
>  		return ret;
>  	}
>  
> +	if (pcie->cfg->has_err_report)
> +		spin_lock_init(&pcie->bridge_lock);
> +
>  	return 0;
>  
>  fail:
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
  2025-06-13 23:28   ` Florian Fainelli
  2025-08-06 18:15   ` Bjorn Helgaas
@ 2025-08-06 19:14   ` Bjorn Helgaas
  2 siblings, 0 replies; 16+ messages in thread
From: Bjorn Helgaas @ 2025-08-06 19:14 UTC (permalink / raw)
  To: Jim Quinlan
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On Fri, Jun 13, 2025 at 06:08:43PM -0400, Jim Quinlan wrote:
> Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> 7216 and its descendants -- have new HW that identifies error details.
> 
> This simple handler determines if the PCIe controller was the cause of the
> abort and if so, prints out diagnostic info.  Unfortunately, an abort still
> occurs.
> 
> Care is taken to read the error registers only when the PCIe bridge is
> active and the PCIe registers are acceptable.  Otherwise, a "die" event
> caused by something other than the PCIe could cause an abort if the PCIe
> "die" handler tried to access registers when the bridge is off.

s/acceptable/accessible/ ?

> Example error output:
>   brcm-pcie 8b20000.pcie: Error: Mem Acc: 32bit, Read, @0x38000000
>   brcm-pcie 8b20000.pcie:  Type: TO=0 Abt=0 UnspReq=1 AccDsble=0 BadAddr=0

Ugly that we have to do this at all, but since I guess it's the best
we can do, looks ok to me.

> Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>
> ---
>  drivers/pci/controller/pcie-brcmstb.c | 155 +++++++++++++++++++++++++-
>  1 file changed, 154 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
> index 400854c893d8..abc56acad1fe 100644
> --- a/drivers/pci/controller/pcie-brcmstb.c
> +++ b/drivers/pci/controller/pcie-brcmstb.c
> @@ -13,15 +13,18 @@
>  #include <linux/ioport.h>
>  #include <linux/irqchip/chained_irq.h>
>  #include <linux/irqdomain.h>
> +#include <linux/kdebug.h>
>  #include <linux/kernel.h>
>  #include <linux/list.h>
>  #include <linux/log2.h>
>  #include <linux/module.h>
>  #include <linux/msi.h>
> +#include <linux/notifier.h>
>  #include <linux/of_address.h>
>  #include <linux/of_irq.h>
>  #include <linux/of_pci.h>
>  #include <linux/of_platform.h>
> +#include <linux/panic_notifier.h>
>  #include <linux/pci.h>
>  #include <linux/pci-ecam.h>
>  #include <linux/printk.h>
> @@ -151,6 +154,39 @@
>  #define  MSI_INT_MASK_SET		0x10
>  #define  MSI_INT_MASK_CLR		0x14
>  
> +/* Error report registers */
> +#define PCIE_OUTB_ERR_TREAT				0x6000
> +#define  PCIE_OUTB_ERR_TREAT_CONFIG_MASK		0x1
> +#define  PCIE_OUTB_ERR_TREAT_MEM_MASK			0x2
> +#define PCIE_OUTB_ERR_VALID				0x6004
> +#define PCIE_OUTB_ERR_CLEAR				0x6008
> +#define PCIE_OUTB_ERR_ACC_INFO				0x600c
> +#define  PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK		0x01
> +#define  PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK		0x02
> +#define  PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK		0x04
> +#define  PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK		0x10

Including "MASK" in these names seems kind of pointless since they're
all single bits.  Some drivers don't bother with "MASK" even for the
multi-bit fields, since uses read pretty naturally without it.  But I
suppose this is following the existing brcmstb style.

> +#define  PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK		0xff00
> +#define PCIE_OUTB_ERR_ACC_ADDR				0x6010
> +#define PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK			0xff00000
> +#define PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK			0xf8000
> +#define PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK		0x7000
> +#define PCIE_OUTB_ERR_ACC_ADDR_REG_MASK			0xfff
> +#define PCIE_OUTB_ERR_CFG_CAUSE				0x6014
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK		0x40
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK		0x20
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK	0x10
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK	0x4
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK	0x2
> +#define  PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK	0x1
> +#define PCIE_OUTB_ERR_MEM_ADDR_LO			0x6018
> +#define PCIE_OUTB_ERR_MEM_ADDR_HI			0x601c
> +#define PCIE_OUTB_ERR_MEM_CAUSE				0x6020
> +#define  PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK		0x40
> +#define  PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK		0x20
> +#define  PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK	0x10
> +#define  PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK	0x2
> +#define  PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK		0x1
> +
>  #define  PCIE_RGR1_SW_INIT_1_PERST_MASK			0x1
>  #define  PCIE_RGR1_SW_INIT_1_PERST_SHIFT		0x0
>  
> @@ -301,6 +337,8 @@ struct brcm_pcie {
>  	struct subdev_regulators *sr;
>  	bool			ep_wakeup_capable;
>  	const struct pcie_cfg_data	*cfg;
> +	struct notifier_block	die_notifier;
> +	struct notifier_block	panic_notifier;
>  	bool			bridge_on;
>  	spinlock_t		bridge_lock;
>  };
> @@ -1711,6 +1749,115 @@ static int brcm_pcie_resume_noirq(struct device *dev)
>  	return ret;
>  }
>  
> +/* Dump out PCIe errors on die or panic */
> +static int _brcm_pcie_dump_err(struct brcm_pcie *pcie,
> +			       const char *type)

Fits on one line.

> +{
> +	void __iomem *base = pcie->base;
> +	int i, is_cfg_err, is_mem_err, lanes;
> +	char *width_str, *direction_str, lanes_str[9];
> +	u32 info, cfg_addr, cfg_cause, mem_cause, lo, hi;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&pcie->bridge_lock, flags);
> +	/* Don't access registers when the bridge is off */
> +	if (!pcie->bridge_on || readl(base + PCIE_OUTB_ERR_VALID) == 0) {
> +		spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> +		return NOTIFY_DONE;
> +	}
> +
> +	/* Read all necessary registers so we can release the spinlock ASAP */
> +	info = readl(base + PCIE_OUTB_ERR_ACC_INFO);
> +	is_cfg_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_CFG_ERR_MASK);
> +	is_mem_err = !!(info & PCIE_OUTB_ERR_ACC_INFO_MEM_ERR_MASK);
> +	if (is_cfg_err) {
> +		cfg_addr = readl(base + PCIE_OUTB_ERR_ACC_ADDR);
> +		cfg_cause = readl(base + PCIE_OUTB_ERR_CFG_CAUSE);
> +	}
> +	if (is_mem_err) {
> +		mem_cause = readl(base + PCIE_OUTB_ERR_MEM_CAUSE);
> +		lo = readl(base + PCIE_OUTB_ERR_MEM_ADDR_LO);
> +		hi = readl(base + PCIE_OUTB_ERR_MEM_ADDR_HI);
> +	}
> +	/* We've got all of the info, clear the error */
> +	writel(1, base + PCIE_OUTB_ERR_CLEAR);
> +	spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> +
> +	dev_err(pcie->dev, "handling %s error notification\n", type);
> +	width_str = (info & PCIE_OUTB_ERR_ACC_INFO_TYPE_64_MASK) ? "64bit" : "32bit";
> +	direction_str = (info & PCIE_OUTB_ERR_ACC_INFO_DIR_WRITE_MASK) ? "Write" : "Read";
> +	lanes = FIELD_GET(PCIE_OUTB_ERR_ACC_INFO_BYTE_LANES_MASK, info);
> +	for (i = 0, lanes_str[8] = 0; i < 8; i++)
> +		lanes_str[i] = (lanes & (1 << i)) ? '1' : '0';
> +
> +	if (is_cfg_err) {
> +		int bus = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_BUS_MASK, cfg_addr);
> +		int dev = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_DEV_MASK, cfg_addr);
> +		int func = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_FUNC_MASK, cfg_addr);
> +		int reg = FIELD_GET(PCIE_OUTB_ERR_ACC_ADDR_REG_MASK, cfg_addr);
> +
> +		dev_err(pcie->dev, "Error: CFG Acc, %s, %s, Bus=%d, Dev=%d, Fun=%d, Reg=0x%x, lanes=%s\n",
> +			width_str, direction_str, bus, dev, func, reg, lanes_str);
> +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccTO=%d AccDsbld=%d Acc64bit=%d\n",
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_TIMEOUT_MASK),
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ABORT_MASK),
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_UNSUPP_REQ_MASK),
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_TIMEOUT_MASK),
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_DISABLED_MASK),
> +			!!(cfg_cause & PCIE_OUTB_ERR_CFG_CAUSE_ACC_64BIT__MASK));
> +	}
> +
> +	if (is_mem_err) {
> +		u64 addr = ((u64)hi << 32) | (u64)lo;
> +
> +		dev_err(pcie->dev, "Error: Mem Acc, %s, %s, @0x%llx, lanes=%s\n",
> +			width_str, direction_str, addr, lanes_str);
> +		dev_err(pcie->dev, " Type: TO=%d Abt=%d UnsupReq=%d AccDsble=%d BadAddr=%d\n",
> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_TIMEOUT_MASK),
> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ABORT_MASK),
> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_UNSUPP_REQ_MASK),
> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_ACC_DISABLED_MASK),
> +			!!(mem_cause & PCIE_OUTB_ERR_MEM_CAUSE_BAD_ADDR_MASK));
> +	}
> +
> +	return NOTIFY_OK;
> +}
> +
> +static int brcm_pcie_die_notify_cb(struct notifier_block *self,
> +				   unsigned long v, void *p)
> +{
> +	struct brcm_pcie *pcie =
> +		container_of(self, struct brcm_pcie, die_notifier);
> +
> +	return _brcm_pcie_dump_err(pcie, "Die");
> +}
> +
> +static int brcm_pcie_panic_notify_cb(struct notifier_block *self,
> +				     unsigned long v, void *p)
> +{
> +	struct brcm_pcie *pcie =
> +		container_of(self, struct brcm_pcie, panic_notifier);
> +
> +	return _brcm_pcie_dump_err(pcie, "Panic");
> +}
> +
> +static void brcm_register_die_notifiers(struct brcm_pcie *pcie)
> +{
> +	pcie->panic_notifier.notifier_call = brcm_pcie_panic_notify_cb;
> +	atomic_notifier_chain_register(&panic_notifier_list,
> +				       &pcie->panic_notifier);
> +
> +	pcie->die_notifier.notifier_call = brcm_pcie_die_notify_cb;
> +	register_die_notifier(&pcie->die_notifier);
> +}
> +
> +static void brcm_unregister_die_notifiers(struct brcm_pcie *pcie)
> +{
> +	unregister_die_notifier(&pcie->die_notifier);
> +	atomic_notifier_chain_unregister(&panic_notifier_list,
> +					 &pcie->panic_notifier);
> +}
> +
>  static void __brcm_pcie_remove(struct brcm_pcie *pcie)
>  {
>  	brcm_msi_remove(pcie);
> @@ -1729,6 +1876,9 @@ static void brcm_pcie_remove(struct platform_device *pdev)
>  
>  	pci_stop_root_bus(bridge->bus);
>  	pci_remove_root_bus(bridge->bus);
> +	if (pcie->cfg->has_err_report)
> +		brcm_unregister_die_notifiers(pcie);
> +
>  	__brcm_pcie_remove(pcie);
>  }
>  
> @@ -1829,6 +1979,7 @@ static const struct pcie_cfg_data bcm7216_cfg = {
>  	.bridge_sw_init_set = brcm_pcie_bridge_sw_init_set_7278,
>  	.has_phy	= true,
>  	.num_inbound_wins = 3,
> +	.has_err_report = true,
>  };
>  
>  static const struct pcie_cfg_data bcm7712_cfg = {
> @@ -2003,8 +2154,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
>  		return ret;
>  	}
>  
> -	if (pcie->cfg->has_err_report)
> +	if (pcie->cfg->has_err_report) {
>  		spin_lock_init(&pcie->bridge_lock);
> +		brcm_register_die_notifiers(pcie);
> +	}
>  
>  	return 0;
>  
> -- 
> 2.34.1
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-06 18:50       ` Bjorn Helgaas
@ 2025-08-06 19:16         ` Jim Quinlan
  2025-08-06 20:41         ` Florian Fainelli
  1 sibling, 0 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-08-06 19:16 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

[-- Attachment #1: Type: text/plain, Size: 3922 bytes --]

On Wed, Aug 6, 2025 at 2:50 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Wed, Aug 06, 2025 at 02:38:12PM -0400, Jim Quinlan wrote:
> > On Wed, Aug 6, 2025 at 2:15 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > >
> > > On Fri, Jun 13, 2025 at 06:08:43PM -0400, Jim Quinlan wrote:
> > > > Whereas most PCIe HW returns 0xffffffff on illegal accesses and the like,
> > > > by default Broadcom's STB PCIe controller effects an abort.  Some SoCs --
> > > > 7216 and its descendants -- have new HW that identifies error details.
> > >
> > > What's the long term plan for this?  This abort is a huge problem that
> > > we're seeing across arm64 platforms.  Forcing a panic and reboot for
> > > every uncorrectable error is pretty hard to deal with.
> >
> > Are you referring to STB/CM systems, Rpi, or something else altogether?
>
> Just in general.  I saw this recently with a Nuvoton NPCM8xx PCIe
> controller.  I'm not an arm64 guy, but I've been told that these
> aborts are basically unrecoverable from a kernel perspective.  For
> some reason several PCIe controllers intended for arm64 seem to raise
> aborts on PCIe errors.  At the moment, that means we can't recover
> from errors like surprise unplugs and other things that *should* be
> recoverable (perhaps at the cost of resetting or disabling a PCIe
> device).
FWIW, our original RC controller was paired with MIPS, so it could be
that a number of non-x86 camps just went with the panic-y behavior.

I believe that the PCIe spec allows this rude behavior, or doesn't
specifically disallow it.  I also remember that there is an ARM
standard initiative for ARM-based systems that requires the PCIe
error-gets-0xffffffff behavior.  We obviously don't conform.   At any
rate, I will send an email now to the HW folks I know to remind them
that we need this behavior, at least as a configurable option.

Regards,
Jim Quinlan
Broadcom STB/CM
>
> > > Is there a plan to someday recover from these aborts?  Or change the
> > > hardware so it can at least be configured to return ~0 data after
> > > logging the error in the hardware registers?
> >
> > Some of our upcoming chips will have the ability to do nothing on
> > errant PCIe writes and return 0xffffffff on errant PCIe reads.   But
> > none of our STB/CM chips do this currently.   I've been asking for
> > this behavior for years but I have limited influence on what happens
> > in HW.
>
> Fingers crossed for either that or some other way to make these things
> recoverable.
>
> > > > This simple handler determines if the PCIe controller was the
> > > > cause of the abort and if so, prints out diagnostic info.
> > > > Unfortunately, an abort still occurs.
> > > >
> > > > Care is taken to read the error registers only when the PCIe
> > > > bridge is active and the PCIe registers are acceptable.
> > > > Otherwise, a "die" event caused by something other than the PCIe
> > > > could cause an abort if the PCIe "die" handler tried to access
> > > > registers when the bridge is off.
> > >
> > > Checking whether the bridge is active is a "mostly-works"
> > > situation since it's always racy.
> >
> > I'm not sure I understand the "racy" comment.  If the PCIe bridge is
> > off, we do not read the PCIe error registers.  In this case, PCIe is
> > probably not the cause of the panic.   In the rare case the PCIe
> > bridge is off  and it was the PCIe that caused the panic, nothing
> > gets reported, and this is where we are without this commit.
> > Perhaps this is what you mean by "mostly-works".  But this is the
> > best that can be done with SW given our HW.
>
> Right, my fault.  The error report registers don't look like standard
> PCIe things, so I suppose they are on the host side, not the PCIe
> side, so they're probably guaranteed to be accessible and non-racy
> unless the bridge is in reset.
>
> Bjorn

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4197 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-06 18:50       ` Bjorn Helgaas
  2025-08-06 19:16         ` Jim Quinlan
@ 2025-08-06 20:41         ` Florian Fainelli
  2025-08-07  5:26           ` Manivannan Sadhasivam
  1 sibling, 1 reply; 16+ messages in thread
From: Florian Fainelli @ 2025-08-06 20:41 UTC (permalink / raw)
  To: Bjorn Helgaas, Jim Quinlan
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Lorenzo Pieralisi, Krzysztof Wilczyński,
	Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On 8/6/25 11:50, Bjorn Helgaas wrote:
>> I'm not sure I understand the "racy" comment.  If the PCIe bridge is
>> off, we do not read the PCIe error registers.  In this case, PCIe is
>> probably not the cause of the panic.   In the rare case the PCIe
>> bridge is off  and it was the PCIe that caused the panic, nothing
>> gets reported, and this is where we are without this commit.
>> Perhaps this is what you mean by "mostly-works".  But this is the
>> best that can be done with SW given our HW.
> 
> Right, my fault.  The error report registers don't look like standard
> PCIe things, so I suppose they are on the host side, not the PCIe
> side, so they're probably guaranteed to be accessible and non-racy
> unless the bridge is in reset.

To expand upon that part: in the situation that I ran into, we had the
PCIe link down and had therefore clock gated the PCIe root complex
hardware to conserve power. Eventually I hit a voluntary panic, and
since all registered panic notifiers are invoked in succession, the one
registered for the PCIe RC was invoked as well. Accessing clock gated
registers would not work and would trigger another fault, which would be
confusing and mingle with the panic I was initially trying to debug.
Hence this check; a clock gated PCIe RC would not be logging any errors
anyway.
-- 
Florian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-06 20:41         ` Florian Fainelli
@ 2025-08-07  5:26           ` Manivannan Sadhasivam
  2025-08-07 14:40             ` Jim Quinlan
  2025-08-07 17:00             ` Florian Fainelli
  0 siblings, 2 replies; 16+ messages in thread
From: Manivannan Sadhasivam @ 2025-08-07  5:26 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: Bjorn Helgaas, Jim Quinlan, linux-pci, Nicolas Saenz Julienne,
	Bjorn Helgaas, Lorenzo Pieralisi, Cyril Brulebois,
	bcm-kernel-feedback-list, jim2101024, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On Wed, Aug 06, 2025 at 01:41:35PM GMT, Florian Fainelli wrote:
> On 8/6/25 11:50, Bjorn Helgaas wrote:
> > > I'm not sure I understand the "racy" comment.  If the PCIe bridge is
> > > off, we do not read the PCIe error registers.  In this case, PCIe is
> > > probably not the cause of the panic.   In the rare case the PCIe
> > > bridge is off  and it was the PCIe that caused the panic, nothing
> > > gets reported, and this is where we are without this commit.
> > > Perhaps this is what you mean by "mostly-works".  But this is the
> > > best that can be done with SW given our HW.
> > 
> > Right, my fault.  The error report registers don't look like standard
> > PCIe things, so I suppose they are on the host side, not the PCIe
> > side, so they're probably guaranteed to be accessible and non-racy
> > unless the bridge is in reset.
> 
> To expand upon that part, the situation that I ran in we had the PCIe link
> down and therefore clock gated the PCIe root complex hardware to conserve
> power. Eventually I did hit a voluntary panic, and since all panic notifiers
> registered are invoked in succession, the one registered for the PCIe RC was
> invoked as well and accessing clock gated registers would not work and
> trigger another fault which would be confusing and mingle with the panic I
> was trying to debug initially. Hence this check, and a clock gated PCIe RC
> would not be logging any errors anyway.

May I ask how you are recovering from link down? Can the driver detect link down
using any platform IRQ?

- Mani

-- 
மணிவண்ணன் சதாசிவம்

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-07  5:26           ` Manivannan Sadhasivam
@ 2025-08-07 14:40             ` Jim Quinlan
  2025-08-07 17:00             ` Florian Fainelli
  1 sibling, 0 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-08-07 14:40 UTC (permalink / raw)
  To: Manivannan Sadhasivam
  Cc: Florian Fainelli, Bjorn Helgaas, linux-pci,
	Nicolas Saenz Julienne, Bjorn Helgaas, Lorenzo Pieralisi,
	Cyril Brulebois, bcm-kernel-feedback-list, jim2101024,
	Lorenzo Pieralisi, Krzysztof Wilczyński, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

[-- Attachment #1: Type: text/plain, Size: 2306 bytes --]

On Thu, Aug 7, 2025 at 1:26 AM Manivannan Sadhasivam <mani@kernel.org> wrote:
>
> On Wed, Aug 06, 2025 at 01:41:35PM GMT, Florian Fainelli wrote:
> > On 8/6/25 11:50, Bjorn Helgaas wrote:
> > > > I'm not sure I understand the "racy" comment.  If the PCIe bridge is
> > > > off, we do not read the PCIe error registers.  In this case, PCIe is
> > > > probably not the cause of the panic.   In the rare case the PCIe
> > > > bridge is off  and it was the PCIe that caused the panic, nothing
> > > > gets reported, and this is where we are without this commit.
> > > > Perhaps this is what you mean by "mostly-works".  But this is the
> > > > best that can be done with SW given our HW.
> > >
> > > Right, my fault.  The error report registers don't look like standard
> > > PCIe things, so I suppose they are on the host side, not the PCIe
> > > side, so they're probably guaranteed to be accessible and non-racy
> > > unless the bridge is in reset.
> >
> > To expand upon that part, the situation that I ran in we had the PCIe link
> > down and therefore clock gated the PCIe root complex hardware to conserve
> > power. Eventually I did hit a voluntary panic, and since all panic notifiers
> > registered are invoked in succession, the one registered for the PCIe RC was
> > invoked as well and accessing clock gated registers would not work and
> > trigger another fault which would be confusing and mingle with the panic I
> > was trying to debug initially. Hence this check, and a clock gated PCIe RC
> > would not be logging any errors anyway.
>
> May I ask how you are recovering from link down? Can the driver detect link down
> using any platform IRQ?

We do have link up/down interrupts on most of our SoCs but we once
implemented a handler and the interrupts were unreliable.  We informed
HW but I do not think they implemented any changes.  We will try again
at some point to ascertain the extent of the issue.

AFAICT such a handler is not a panacea.  Having a link-down handler
may be able to immediately prevent panics for config space accesses by
intercepting them but not incoming memory accesses from the host or
endpoint device.

Regards,
Jim Quinlan
Broadcom STB/CM
>
> - Mani
>
> --
> மணிவண்ணன் சதாசிவம்

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4197 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver
  2025-08-07  5:26           ` Manivannan Sadhasivam
  2025-08-07 14:40             ` Jim Quinlan
@ 2025-08-07 17:00             ` Florian Fainelli
  1 sibling, 0 replies; 16+ messages in thread
From: Florian Fainelli @ 2025-08-07 17:00 UTC (permalink / raw)
  To: Manivannan Sadhasivam
  Cc: Bjorn Helgaas, Jim Quinlan, linux-pci, Nicolas Saenz Julienne,
	Bjorn Helgaas, Lorenzo Pieralisi, Cyril Brulebois,
	bcm-kernel-feedback-list, jim2101024, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

On 8/6/25 22:26, Manivannan Sadhasivam wrote:
> On Wed, Aug 06, 2025 at 01:41:35PM GMT, Florian Fainelli wrote:
>> On 8/6/25 11:50, Bjorn Helgaas wrote:
>>>> I'm not sure I understand the "racy" comment.  If the PCIe bridge is
>>>> off, we do not read the PCIe error registers.  In this case, PCIe is
>>>> probably not the cause of the panic.   In the rare case the PCIe
>>>> bridge is off  and it was the PCIe that caused the panic, nothing
>>>> gets reported, and this is where we are without this commit.
>>>> Perhaps this is what you mean by "mostly-works".  But this is the
>>>> best that can be done with SW given our HW.
>>>
>>> Right, my fault.  The error report registers don't look like standard
>>> PCIe things, so I suppose they are on the host side, not the PCIe
>>> side, so they're probably guaranteed to be accessible and non-racy
>>> unless the bridge is in reset.
>>
>> To expand upon that part, the situation that I ran in we had the PCIe link
>> down and therefore clock gated the PCIe root complex hardware to conserve
>> power. Eventually I did hit a voluntary panic, and since all panic notifiers
>> registered are invoked in succession, the one registered for the PCIe RC was
>> invoked as well and accessing clock gated registers would not work and
>> trigger another fault which would be confusing and mingle with the panic I
>> was trying to debug initially. Hence this check, and a clock gated PCIe RC
>> would not be logging any errors anyway.
> 
> May I ask how you are recovering from link down? Can the driver detect link down
> using any platform IRQ?

Just to be clear, what I was describing here is not a link down
recovery. The point I was trying to convey is that we have multiple
busses in our system (DRAM, on-chip registers, PCIe) and each one of
them has its own way of reporting errors, so if we get a form of system
error/kernel panic we like to interrogate each one of them to figure out
the cause. In the case I was describing, I was actually tracking down a
bad DRAM access, but the error reporting came from the on-chip register
arbiter because, prior to that, we had tried to read from the clock
gated PCIe bridge to determine whether it was responsible for the bad
access. This leads you to an incorrect source of the bad access, and so
that's why we guard the panic handler invocation within the PCIe root
complex with a check whether the bridge is in reset or not.

If this is still not clear, let me know.
-- 
Florian

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active
  2025-08-06 19:14   ` Bjorn Helgaas
@ 2025-08-07 18:03     ` Jim Quinlan
  0 siblings, 0 replies; 16+ messages in thread
From: Jim Quinlan @ 2025-08-07 18:03 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: linux-pci, Nicolas Saenz Julienne, Bjorn Helgaas,
	Lorenzo Pieralisi, Cyril Brulebois, bcm-kernel-feedback-list,
	jim2101024, Florian Fainelli, Lorenzo Pieralisi,
	Krzysztof Wilczyński, Manivannan Sadhasivam, Rob Herring,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	moderated list:BROADCOM BCM2711/BCM2835 ARM ARCHITECTURE,
	open list

[-- Attachment #1: Type: text/plain, Size: 6533 bytes --]

On Wed, Aug 6, 2025 at 3:14 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> On Fri, Jun 13, 2025 at 06:08:42PM -0400, Jim Quinlan wrote:
> > In a future commit, a new handler will be introduced that in part does
> > reads and writes to some of the PCIe registers.  When this handler is
> > invoked, it is paramount that it does not do these register accesses when
> > the PCIe bridge is inactive, as this will cause CPU abort errors.
> >
> > To solve this we keep a spinlock that guards a variable which indicates
> > whether the bridge is on or off.  When the bridge is on, access of the PCIe
> > HW registers may proceed.
> >
> > Since there are multiple ways to reset the bridge, we introduce a general
> > function that obtains the spinlock, calls the specific function that is used
> > for the specific SoC, sets the bridge active indicator variable, and
> > releases the spinlock.
> >
> > Signed-off-by: Jim Quinlan <james.quinlan@broadcom.com>
> > ---
> >  drivers/pci/controller/pcie-brcmstb.c | 40 +++++++++++++++++++++++----
> >  1 file changed, 35 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c
> > index 92887b394eb4..400854c893d8 100644
> > --- a/drivers/pci/controller/pcie-brcmstb.c
> > +++ b/drivers/pci/controller/pcie-brcmstb.c
> > @@ -29,6 +29,7 @@
> >  #include <linux/reset.h>
> >  #include <linux/sizes.h>
> >  #include <linux/slab.h>
> > +#include <linux/spinlock.h>
> >  #include <linux/string.h>
> >  #include <linux/types.h>
> >
> > @@ -254,6 +255,7 @@ struct pcie_cfg_data {
> >       int (*perst_set)(struct brcm_pcie *pcie, u32 val);
> >       int (*bridge_sw_init_set)(struct brcm_pcie *pcie, u32 val);
> >       int (*post_setup)(struct brcm_pcie *pcie);
> > +     bool has_err_report;
>
> It doesn't look worth it to me to add this.  It only avoids locking in
> a non-performance path.
>
> >  };
> >
> >  struct subdev_regulators {
> > @@ -299,6 +301,8 @@ struct brcm_pcie {
> >       struct subdev_regulators *sr;
> >       bool                    ep_wakeup_capable;
> >       const struct pcie_cfg_data      *cfg;
> > +     bool                    bridge_on;
> > +     spinlock_t              bridge_lock;
> >  };
> >
> >  static inline bool is_bmips(const struct brcm_pcie *pcie)
> > @@ -306,6 +310,24 @@ static inline bool is_bmips(const struct brcm_pcie *pcie)
> >       return pcie->cfg->soc_base == BCM7435 || pcie->cfg->soc_base == BCM7425;
> >  }
> >
> > +static inline int brcm_pcie_bridge_sw_init_set(struct brcm_pcie *pcie, u32 val)
> > +{
> > +     unsigned long flags;
> > +     int ret;
> > +
> > +     if (pcie->cfg->has_err_report)
> > +             spin_lock_irqsave(&pcie->bridge_lock, flags);
> > +
> > +     ret = pcie->cfg->bridge_sw_init_set(pcie, val);
> > +     if (ret)
> > +             pcie->bridge_on = !val;
>
> AFAICT, .bridge_sw_init_set(1) asserts reset, .bridge_sw_init_set(0)
> deasserts reset, and it returns 0 for success, so I'm confused about
> this.  If either assert or deassert failed (ret != 0), I guess we
> don't know the state of the bridge and can't assume it's active, so I
> would have expected something like:
>
>   ret = pcie->cfg->bridge_sw_init_set(pcie, val);
>   if (ret)
>     pcie->bridge_on = false;
>   else
>     pcie->bridge_on = !val;
Ack

>
> Tangent: the last "return ret" in brcm_pcie_bridge_sw_init_set_generic()
> should be "return 0" and drop the unnecessary initialization of "ret".
Ack
>
> And the code there would be vastly improved by using FIELD_PREP() or
> u32p_replace_bits() and getting rid of the shifting.
Ack

>
> > +     if (pcie->cfg->has_err_report)
> > +             spin_unlock_irqrestore(&pcie->bridge_lock, flags);
> > +
> > +     return ret;
> > +}
> > +
> >  /*
> >   * This is to convert the size of the inbound "BAR" region to the
> >   * non-linear values of PCIE_X_MISC_RC_BAR[123]_CONFIG_LO.SIZE
> > @@ -1078,7 +1100,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
> >       int memc, ret;
> >
> >       /* Reset the bridge */
> > -     ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
> > +     ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
> >       if (ret)
> >               return ret;
> >
> > @@ -1094,7 +1116,7 @@ static int brcm_pcie_setup(struct brcm_pcie *pcie)
> >       usleep_range(100, 200);
> >
> >       /* Take the bridge out of reset */
> > -     ret = pcie->cfg->bridge_sw_init_set(pcie, 0);
> > +     ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
> >       if (ret)
> >               return ret;
> >
> > @@ -1545,7 +1567,7 @@ static int brcm_pcie_turn_off(struct brcm_pcie *pcie)
> >
> >       if (!(pcie->cfg->quirks & CFG_QUIRK_AVOID_BRIDGE_SHUTDOWN))
> >               /* Shutdown PCIe bridge */
> > -             ret = pcie->cfg->bridge_sw_init_set(pcie, 1);
> > +             ret = brcm_pcie_bridge_sw_init_set(pcie, 1);
> >
> >       return ret;
> >  }
> > @@ -1633,7 +1655,9 @@ static int brcm_pcie_resume_noirq(struct device *dev)
> >               goto err_reset;
> >
> >       /* Take bridge out of reset so we can access the SERDES reg */
> > -     pcie->cfg->bridge_sw_init_set(pcie, 0);
> > +     ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
> > +     if (ret)
> > +             goto err_reset;
> >
> >       /* SERDES_IDDQ = 0 */
> >       tmp = readl(base + HARD_DEBUG(pcie));
> > @@ -1901,7 +1925,10 @@ static int brcm_pcie_probe(struct platform_device *pdev)
> >       if (ret)
> >               return dev_err_probe(&pdev->dev, ret, "could not enable clock\n");
> >
> > -     pcie->cfg->bridge_sw_init_set(pcie, 0);
> > +     ret = brcm_pcie_bridge_sw_init_set(pcie, 0);
> > +     if (ret)
> > +             return dev_err_probe(&pdev->dev, ret,
> > +                                  "could not un-reset the bridge\n");
>
> "un-reset" doesn't mean anything to me.  Is this the same as "could
> not take the bridge out of reset"?  Or maybe "could not deassert
> bridge reset"?
Ack

Thanks,
Jim Quinlan
Broadcom STB/CM

>
> >       if (pcie->swinit_reset) {
> >               ret = reset_control_assert(pcie->swinit_reset);
> > @@ -1976,6 +2003,9 @@ static int brcm_pcie_probe(struct platform_device *pdev)
> >               return ret;
> >       }
> >
> > +     if (pcie->cfg->has_err_report)
> > +             spin_lock_init(&pcie->bridge_lock);
> > +
> >       return 0;
> >
> >  fail:
> > --
> > 2.34.1
> >

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4197 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2025-08-07 18:03 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-06-13 22:08 [PATCH 0/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
2025-06-13 22:08 ` [PATCH 1/2] PCI: brcmstb: Add a way to indicate if PCIe bridge is active Jim Quinlan
2025-06-13 23:23   ` Florian Fainelli
2025-08-06 19:14   ` Bjorn Helgaas
2025-08-07 18:03     ` Jim Quinlan
2025-06-13 22:08 ` [PATCH 2/2] PCI: brcmstb: Add panic/die handler to driver Jim Quinlan
2025-06-13 23:28   ` Florian Fainelli
2025-08-06 18:15   ` Bjorn Helgaas
2025-08-06 18:38     ` Jim Quinlan
2025-08-06 18:50       ` Bjorn Helgaas
2025-08-06 19:16         ` Jim Quinlan
2025-08-06 20:41         ` Florian Fainelli
2025-08-07  5:26           ` Manivannan Sadhasivam
2025-08-07 14:40             ` Jim Quinlan
2025-08-07 17:00             ` Florian Fainelli
2025-08-06 19:14   ` Bjorn Helgaas

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).