* [PATCH v3] wifi: mt76: mt7921: Add PCIe AER handler support to prevent system crash
@ 2026-05-06 7:04 JB Tsai
0 siblings, 0 replies; only message in thread
From: JB Tsai @ 2026-05-06 7:04 UTC (permalink / raw)
To: nbd, lorenzo
Cc: linux-wireless, linux-mediatek, Deren.Wu, Sean.Wang, Quan.Zhou,
Ryder.Lee, Leon.Yen, litien.chang, eason.lai, jb.tsai, Eason Lai,
Sean Wang, Jeff Hsu, Michael Lo
From: Eason Lai <Eason.Lai@mediatek.com>
When an AER error occurs and the bus hangs, register reads return
0xFFFFFFFF. This corrupts the DMA queue state and results in an invalid
memory access when q->desc[] or q->entry[] is accessed:
Unable to handle kernel paging request at virtual address
ffffffc01099eac0
pc : mt76_dma_add_buf+0x124/0x188 [mt76]
lr : mt76_dma_rx_fill+0x11c/0x1d8 [mt76]
sp : ffffffc016d9bbf0
x29: ffffffc016d9bc10 x28: 0000000000000000
x27: 0000000000000000 x26: ffffffb7855e50b8
x25: ffffffb80d04f000 x24: 0000000000000000
x23: 0000000000000ec0 x22: ffffffb796803648
x21: ffffffb796801f80 x20: ffffffb7968035f8
x19: 0000000000000ec0 x18: 0000000000000000
x17: 000000004ec00000 x16: 000000000ec00000
x15: ffffffc01099eac0 x14: 000000004ec00000
x13: 00000000ffc5a000 x12: ffffffc016d9bc32
x11: 00000000ffffffff x10: 0000000000000002
x9 : 0000000000000000 x8 : 000000000000b4ac
x7 : 0000000000000a20 x6 : ffffffb6c1806400
x5 : 0000000000000000 x4 : ffffffb80d04f000
x3 : 0000000000000000 x2 : 0000000000000001
x1 : 000000000ec04000 x0 : ffffffb7968035f8
Call trace:
mt76_dma_add_buf+0x124/0x188 [mt76 (HASH:1029 4)]
mt76_dma_rx_reset+0xe8/0xfc [mt76 (HASH:1029 4)]
mt7921_wpdma_reset+0x188/0x1b0 [mt7921e (HASH:ee48 5)]
mt7921e_mac_reset+0x128/0x418 [mt7921e (HASH:ee48 5)]
mt7921_mac_reset_work+0xac/0x1a8 [mt7921_common (HASH:f721 6)]
process_one_work+0x188/0x514
worker_thread+0x12c/0x300
kthread+0x140/0x1fc
ret_from_fork+0x10/0x30
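Fix this by checking the bus_hung flag in mt7921_mac_reset_work() before
attempting the reset sequence, and by installing no-op bus operations
when an unrecoverable AER error is detected, preventing further invalid
hardware accesses. In addition, Q_READ()/Q_WRITE() skip the register
access and MMIO MCU messages fail with -EIO while bus_hung is set, and
mt76_dma_sync_idx() falls back to index 0 when the DMA index read back
from the hardware is out of range.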
Due to hardware limitations - such as the lack of a connected hardware
reset pin or the absence of host re-probe functionality - affected Wi-Fi
devices may not fully recover to a normal operational state after
certain errors, even with AER enabled.
Fixes: 17f1de56df05 ("mt76: add common code shared between multiple chipsets")
Co-developed-by: Sean Wang <sean.wang@mediatek.com>
Signed-off-by: Sean Wang <sean.wang@mediatek.com>
Co-developed-by: Jeff Hsu <jeff.hsu@mediatek.com>
Signed-off-by: Jeff Hsu <jeff.hsu@mediatek.com>
Signed-off-by: Eason Lai <Eason.Lai@mediatek.com>
Co-developed-by: Michael Lo <michael.lo@mediatek.com>
---
v2: remove the timestamps from the error call trace
v3: fix the invalid memory access triggered when the bus is hung
---
drivers/net/wireless/mediatek/mt76/dma.c | 10 +-
drivers/net/wireless/mediatek/mt76/dma.h | 16 ++-
drivers/net/wireless/mediatek/mt76/mcu.c | 12 +-
.../net/wireless/mediatek/mt76/mt76_connac.h | 5 +
.../net/wireless/mediatek/mt76/mt7921/mac.c | 3 +
.../net/wireless/mediatek/mt76/mt7921/pci.c | 103 ++++++++++++++++++
6 files changed, 140 insertions(+), 9 deletions(-)
diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c
index f8c2fe5f2f58..ec84961b11db 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.c
+++ b/drivers/net/wireless/mediatek/mt76/dma.c
@@ -189,6 +189,8 @@ mt76_dma_queue_magic_cnt_init(struct mt76_dev *dev, struct mt76_queue *q)
static void
mt76_dma_sync_idx(struct mt76_dev *dev, struct mt76_queue *q)
{
+ int last = 0;
+
if ((q->flags & MT_QFLAG_WED_RRO_EN) &&
(!is_mt7992(dev) || !mt76_npu_device_active(dev)))
Q_WRITE(q, ring_size, MT_DMA_RRO_EN | q->ndesc);
@@ -201,7 +203,9 @@ mt76_dma_sync_idx(struct mt76_dev *dev, struct mt76_queue *q)
}
Q_WRITE(q, desc_base, q->desc_dma);
- q->head = Q_READ(q, dma_idx);
+
+ last = Q_READ(q, dma_idx);
+ q->head = (last >= 0 && last < q->ndesc) ? last : 0;
q->tail = q->head;
}
@@ -625,8 +629,8 @@ mt76_dma_tx_queue_skb_raw(struct mt76_dev *dev, struct mt76_queue *q,
buf.len = skb->len;
spin_lock_bh(&q->lock);
- mt76_dma_add_buf(dev, q, &buf, 1, tx_info, skb, NULL);
- mt76_dma_kick_queue(dev, q);
+ if (mt76_dma_add_buf(dev, q, &buf, 1, tx_info, skb, NULL) >= 0)
+ mt76_dma_kick_queue(dev, q);
spin_unlock_bh(&q->lock);
return 0;
diff --git a/drivers/net/wireless/mediatek/mt76/dma.h b/drivers/net/wireless/mediatek/mt76/dma.h
index 2a0226c83f3c..7f395a4c509f 100644
--- a/drivers/net/wireless/mediatek/mt76/dma.h
+++ b/drivers/net/wireless/mediatek/mt76/dma.h
@@ -108,8 +108,20 @@
#else
-#define Q_READ(_q, _field) readl(&(_q)->regs->_field)
-#define Q_WRITE(_q, _field, _val) writel(_val, &(_q)->regs->_field)
+/*
+ * While dev->bus_hung is set, Q_READ() yields 0 and Q_WRITE() is a no-op.
+ */
+#define Q_READ(_q, _field) ({ \
+	u32 __val = 0; \
+	if (likely(!atomic_read(&(_q)->dev->bus_hung))) \
+		__val = readl(&(_q)->regs->_field); \
+	__val; \
+})
+
+#define Q_WRITE(_q, _field, _val) do { \
+	if (likely(!atomic_read(&(_q)->dev->bus_hung))) \
+		writel(_val, &(_q)->regs->_field); \
+} while (0)
#endif
diff --git a/drivers/net/wireless/mediatek/mt76/mcu.c b/drivers/net/wireless/mediatek/mt76/mcu.c
index cbfb3bbec503..7149b2f7aafd 100644
--- a/drivers/net/wireless/mediatek/mt76/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mcu.c
@@ -78,15 +78,19 @@ int mt76_mcu_skb_send_and_get_msg(struct mt76_dev *dev, struct sk_buff *skb,
unsigned long expires;
int ret, seq;
- if (mt76_is_sdio(dev))
- if (test_bit(MT76_RESET, &dev->phy.state) && atomic_read(&dev->bus_hung))
- return -EIO;
-
if (ret_skb)
*ret_skb = NULL;
mutex_lock(&dev->mcu.mutex);
+ if ((mt76_is_mmio(dev) && atomic_read(&dev->bus_hung)) ||
+ (mt76_is_sdio(dev) && test_bit(MT76_RESET, &dev->phy.state) &&
+ atomic_read(&dev->bus_hung))) {
+ orig_skb = skb;
+ ret = -EIO;
+ goto out;
+ }
+
if (dev->mcu_ops->mcu_skb_prepare_msg) {
orig_skb = skb;
ret = dev->mcu_ops->mcu_skb_prepare_msg(dev, skb, cmd, &seq);
diff --git a/drivers/net/wireless/mediatek/mt76/mt76_connac.h b/drivers/net/wireless/mediatek/mt76/mt76_connac.h
index 51423c7740bd..17f9c6e65849 100644
--- a/drivers/net/wireless/mediatek/mt76/mt76_connac.h
+++ b/drivers/net/wireless/mediatek/mt76/mt76_connac.h
@@ -48,6 +48,11 @@ enum rx_pkt_type {
#define MT_TXD_LEN_MSDU_LAST BIT(14)
#define MT_TXD_LEN_AMSDU_LAST BIT(15)
+/* PCIe AER registers, as fixed offsets into PCI configuration space */
+#define PCIE_AER_UNC_STATUS_OFFSET 0x204
+#define PCIE_AER_UNC_MASK_OFFSET 0x208
+#define PCIE_AER_CO_STATUS_OFFSET 0x210
+
enum {
CMD_CBW_20MHZ = IEEE80211_STA_RX_BW_20,
CMD_CBW_40MHZ = IEEE80211_STA_RX_BW_40,
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
index 03b4960db73f..30f431896b1c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/mac.c
@@ -668,6 +668,9 @@ void mt7921_mac_reset_work(struct work_struct *work)
cancel_work_sync(&pm->wake_work);
for (i = 0; i < 10; i++) {
+ if (atomic_read(&dev->mt76.bus_hung))
+ return;
+
mutex_lock(&dev->mt76.mutex);
ret = mt792x_dev_reset(dev);
mutex_unlock(&dev->mt76.mutex);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
index 7a790ddf43bb..6b9d267a7e7b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7921/pci.c
@@ -594,6 +594,108 @@ static int mt7921_pci_resume(struct device *device)
return err;
}
+/* Bus ops for a hung bus: reads yield 0, writes are dropped, rest is unset */
+static u32 mt7921_aer_rr(struct mt76_dev *mdev, u32 offset)
+{
+	return 0;
+}
+
+static void mt7921_aer_wr(struct mt76_dev *mdev, u32 offset, u32 val)
+{
+}
+
+static u32 mt7921_aer_rmw(struct mt76_dev *mdev, u32 offset, u32 mask, u32 val)
+{
+	return 0;
+}
+
+static const struct mt76_bus_ops mt7921_aer_bus_hung_ops = {
+	.rr = mt7921_aer_rr,
+	.wr = mt7921_aer_wr,
+	.rmw = mt7921_aer_rmw,
+	.type = MT76_BUS_MMIO,
+};
+
+/* Mark the bus hung and divert register access to the no-op ops (once) */
+static void mt7921_pci_set_aer_bus_hung_ops(struct mt792x_dev *dev)
+{
+	if (READ_ONCE(dev->mt76.bus) != &mt7921_aer_bus_hung_ops) {
+		atomic_set(&dev->mt76.bus_hung, true);
+		WRITE_ONCE(dev->mt76.bus, &mt7921_aer_bus_hung_ops);
+	}
+}
+
+/*
+ * error_detected callback: disconnect (and stub out register access) on
+ * permanent failure or when an uncorrectable error is latched; request a slot
+ * reset when the channel is frozen or a correctable error is latched, else NONE.
+ * NOTE(review): this callback is presumably only reached with an
+ * uncorrectable error pending, so the reset path looks hard to hit - confirm.
+ */
+static pci_ers_result_t mt7921_error_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	struct mt76_dev *mdev = pci_get_drvdata(pdev);
+	struct mt792x_dev *dev = container_of(mdev, struct mt792x_dev, mt76);
+	u32 aer_unc_val = 0, aer_co_val = 0;
+
+	dev_err(mdev->dev, "PCIE error detect state: %d\n", state);
+
+	/* Make sure the IRQ tasklet is no longer scheduled or running */
+	tasklet_kill(&mdev->irq_tasklet);
+
+	if (state == pci_channel_io_perm_failure) {
+		mt7921_pci_set_aer_bus_hung_ops(dev);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_read_config_dword(pdev, PCIE_AER_UNC_STATUS_OFFSET, &aer_unc_val);
+	pci_read_config_dword(pdev, PCIE_AER_CO_STATUS_OFFSET, &aer_co_val);
+	dev_warn(mdev->dev, "PCIE_AER_UNC_STATUS_OFFSET: 0x%x\n", aer_unc_val);
+	dev_warn(mdev->dev, "PCIE_AER_CO_STATUS_OFFSET: 0x%x\n", aer_co_val);
+
+	/* Uncorrectable link error: the device cannot recover from it */
+	if (aer_unc_val != 0) {
+		mt7921_pci_set_aer_bus_hung_ops(dev);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	/* Frozen channel or correctable error: ask for a slot reset */
+	if (state == pci_channel_io_frozen || aer_co_val != 0) {
+		/* Disable PCIE activity first. */
+		pci_disable_device(pdev);
+		return PCI_ERS_RESULT_NEED_RESET;
+	}
+
+	return PCI_ERS_RESULT_NONE;
+}
+
+/* Re-enable the device after a slot reset, then trigger the vendor reset */
+static pci_ers_result_t mt7921_slot_reset(struct pci_dev *pdev)
+{
+	struct mt76_dev *mdev = pci_get_drvdata(pdev);
+	int ret;
+
+	ret = pci_enable_device_mem(pdev);
+	if (ret) {
+		dev_err(mdev->dev, "pci_enable_device_mem failed: %d\n", ret);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	pci_set_master(pdev);
+	pci_restore_state(pdev);
+	pci_save_state(pdev);
+	/* Also run the vendor reset to bring the chip into a clean state */
+	mt792x_reset(mdev);
+
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
+static const struct pci_error_handlers mt7921_err_handler = {
+	.error_detected = mt7921_error_detected,
+	.slot_reset = mt7921_slot_reset,
+};
+
static void mt7921_pci_shutdown(struct pci_dev *pdev)
{
mt7921_pci_remove(pdev);
@@ -608,6 +710,7 @@ static struct pci_driver mt7921_pci_driver = {
.remove = mt7921_pci_remove,
.shutdown = mt7921_pci_shutdown,
.driver.pm = pm_sleep_ptr(&mt7921_pm_ops),
+ .err_handler = &mt7921_err_handler,
};
module_pci_driver(mt7921_pci_driver);
--
2.45.2
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2026-05-06 7:05 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-06 7:04 [PATCH v3] wifi: mt76: mt7921: Add PCIe AER handler support to prevent system crash JB Tsai
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox