From: Jiawen Wu <jiawenwu@trustnetic.com>
To: netdev@vger.kernel.org
Cc: "Mengyuan Lou" <mengyuanlou@net-swift.com>,
"Andrew Lunn" <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>,
"Eric Dumazet" <edumazet@google.com>,
"Jakub Kicinski" <kuba@kernel.org>,
"Paolo Abeni" <pabeni@redhat.com>,
"Richard Cochran" <richardcochran@gmail.com>,
"Russell King" <linux@armlinux.org.uk>,
"Jacob Keller" <jacob.e.keller@intel.com>,
"Michal Swiatkowski" <michal.swiatkowski@linux.intel.com>,
"Simon Horman" <horms@kernel.org>, "Kees Cook" <kees@kernel.org>,
"Larysa Zaremba" <larysa.zaremba@intel.com>,
"Joe Damato" <joe@dama.to>, "Breno Leitao" <leitao@debian.org>,
"Aleksandr Loktionov" <aleksandr.loktionov@intel.com>,
"Uwe Kleine-König (The Capable Hub)"
<u.kleine-koenig@baylibre.com>,
"Fabio Baltieri" <fabio.baltieri@gmail.com>,
"Thomas Gleixner" <tglx@kernel.org>,
"Greg Kroah-Hartman" <gregkh@linuxfoundation.org>,
"Jiawen Wu" <jiawenwu@trustnetic.com>
Subject: [PATCH net-next v6 5/5] net: wangxun: add pcie error handler
Date: Wed, 10 Jun 2026 14:09:17 +0800 [thread overview]
Message-ID: <20260610060917.23980-6-jiawenwu@trustnetic.com> (raw)
In-Reply-To: <20260610060917.23980-1-jiawenwu@trustnetic.com>
Add PCI error handlers so that the AER driver can recover the device from
PCIe errors. When a PCIe error occurs, the netdev watchdog Tx timeout
sometimes fires before the AER error is reported, and MMIO accesses during
the resulting reset can block the CPU. To prevent this, check the PCIe
error status in .ndo_tx_timeout. The ngbe implementation is not yet
complete; it will be finished in a future patch.
Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
---
drivers/net/ethernet/wangxun/libwx/wx_err.c | 148 +++++++++++++++++-
drivers/net/ethernet/wangxun/libwx/wx_err.h | 2 +
drivers/net/ethernet/wangxun/libwx/wx_type.h | 4 +
drivers/net/ethernet/wangxun/ngbe/ngbe_main.c | 33 +++-
.../net/ethernet/wangxun/txgbe/txgbe_main.c | 30 +++-
5 files changed, 212 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_err.c b/drivers/net/ethernet/wangxun/libwx/wx_err.c
index ee27f96735dc..aca52b9e8260 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_err.c
+++ b/drivers/net/ethernet/wangxun/libwx/wx_err.c
@@ -4,11 +4,124 @@
#include <linux/netdevice.h>
#include <linux/pci.h>
+#include <linux/aer.h>
#include "wx_type.h"
#include "wx_lib.h"
#include "wx_err.h"
+/**
+ * wx_io_error_detected - called when PCI error is detected
+ * @pdev: Pointer to PCI device
+ * @state: The current pci connection state
+ *
+ * Return: pci_ers_result_t.
+ *
+ * This function is called after a PCI bus error affecting
+ * this device has been detected.
+ */
+static pci_ers_result_t wx_io_error_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct wx *wx = pci_get_drvdata(pdev);
+ struct net_device *netdev;
+
+ if (!wx)
+ return PCI_ERS_RESULT_DISCONNECT;
+
+ netdev = wx->netdev;
+ if (!netif_device_present(netdev))
+ return PCI_ERS_RESULT_DISCONNECT;
+
+ if (state == pci_channel_io_perm_failure)
+ return PCI_ERS_RESULT_DISCONNECT;
+
+ rtnl_lock();
+ netif_device_detach(netdev);
+ set_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags);
+ wx_soft_quiesce(wx);
+
+ if (!test_and_set_bit(WX_STATE_DISABLED, wx->state))
+ pci_disable_device(pdev);
+ rtnl_unlock();
+
+ /* Request a slot reset. */
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+/**
+ * wx_io_slot_reset - called after the pci bus has been reset.
+ * @pdev: Pointer to PCI device
+ *
+ * Return: pci_ers_result_t.
+ *
+ * Restart the card from scratch, as if from a cold-boot.
+ */
+static pci_ers_result_t wx_io_slot_reset(struct pci_dev *pdev)
+{
+ struct wx *wx = pci_get_drvdata(pdev);
+ pci_ers_result_t result;
+
+ if (pci_enable_device_mem(pdev)) {
+ wx_err(wx, "Cannot re-enable PCI device after reset.\n");
+ result = PCI_ERS_RESULT_DISCONNECT;
+ } else {
+ /* make all memory operations done before clearing the flag */
+ smp_mb__before_atomic();
+ clear_bit(WX_STATE_DISABLED, wx->state);
+ clear_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags);
+ pci_set_master(pdev);
+ pci_restore_state(pdev);
+ pci_wake_from_d3(pdev, false);
+
+ rtnl_lock();
+ if (netif_running(wx->netdev) && wx->down_suspend)
+ wx->down_suspend(wx);
+ if (wx->do_reset)
+ wx->do_reset(wx->netdev, false);
+ rtnl_unlock();
+ result = PCI_ERS_RESULT_RECOVERED;
+ }
+
+ pci_aer_clear_nonfatal_status(pdev);
+
+ return result;
+}
+
+/**
+ * wx_io_resume - called when traffic can start flowing again.
+ * @pdev: Pointer to PCI device
+ *
+ * This callback is called when the error recovery driver tells us that
+ * it's OK to resume normal operation.
+ */
+static void wx_io_resume(struct pci_dev *pdev)
+{
+ struct wx *wx = pci_get_drvdata(pdev);
+ struct net_device *netdev;
+ int err;
+
+ netdev = wx->netdev;
+ rtnl_lock();
+ if (netif_running(netdev)) {
+ err = netdev->netdev_ops->ndo_open(netdev);
+ if (err) {
+ wx_err(wx, "Failed to open netdev after reset\n");
+ goto out;
+ }
+ }
+ netif_device_attach(netdev);
+out:
+ rtnl_unlock();
+}
+
+const struct pci_error_handlers wx_err_handler = {
+ .error_detected = wx_io_error_detected,
+ .slot_reset = wx_io_slot_reset,
+ .resume = wx_io_resume,
+};
+EXPORT_SYMBOL(wx_err_handler);
+
static void wx_pf_reset_subtask(struct wx *wx)
{
if (!test_and_clear_bit(WX_FLAG_NEED_PF_RESET, wx->flags))
@@ -25,6 +138,9 @@ static void wx_reset_task(struct work_struct *work)
rtnl_lock();
+ if (test_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags))
+ wx_soft_quiesce(wx);
+
if (test_bit(WX_STATE_DOWN, wx->state) ||
test_bit(WX_STATE_RESETTING, wx->state))
goto out;
@@ -139,6 +255,33 @@ void wx_check_hang_subtask(struct wx *wx)
}
EXPORT_SYMBOL(wx_check_hang_subtask);
+static bool wx_check_pcie_error(struct wx *wx)
+{
+ u16 vid, pci_cmd;
+
+ pci_read_config_word(wx->pdev, PCI_VENDOR_ID, &vid);
+ pci_read_config_word(wx->pdev, PCI_COMMAND, &pci_cmd);
+
+ /* PCIe link is lost or memory space is not accessible */
+ if (vid == 0xFFFF || !(pci_cmd & 0x2))
+ return true;
+
+ return false;
+}
+
+static void wx_tx_timeout_recovery(struct wx *wx)
+{
+ /*
+ * When a PCIe hardware error occurs, the driver should initiate a PCIe
+ * recovery mechanism. However, this recovery flow relies on the AER
+ * driver for current kernel policy. Therefore, a self-contained
+ * recovery mechanism is not implemented yet.
+ */
+ set_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags);
+ wx_err(wx, "PCIe error detected during tx timeout\n");
+ queue_work(wx->reset_wq, &wx->reset_task);
+}
+
static void wx_tx_timeout_reset(struct wx *wx)
{
if (test_bit(WX_STATE_DOWN, wx->state))
@@ -153,7 +296,10 @@ void wx_tx_timeout(struct net_device *netdev, unsigned int __always_unused txque
{
struct wx *wx = netdev_priv(netdev);
- wx_tx_timeout_reset(wx);
+ if (wx_check_pcie_error(wx))
+ wx_tx_timeout_recovery(wx);
+ else
+ wx_tx_timeout_reset(wx);
}
EXPORT_SYMBOL(wx_tx_timeout);
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_err.h b/drivers/net/ethernet/wangxun/libwx/wx_err.h
index 1eed13e48095..a6a82a263528 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_err.h
+++ b/drivers/net/ethernet/wangxun/libwx/wx_err.h
@@ -7,6 +7,8 @@
#ifndef _WX_ERR_H_
#define _WX_ERR_H_
+extern const struct pci_error_handlers wx_err_handler;
+
void wx_check_err_subtask(struct wx *wx);
int wx_init_err_task(struct wx *wx);
void wx_check_hang_subtask(struct wx *wx);
diff --git a/drivers/net/ethernet/wangxun/libwx/wx_type.h b/drivers/net/ethernet/wangxun/libwx/wx_type.h
index a8b4e84787f4..c2edb74881f2 100644
--- a/drivers/net/ethernet/wangxun/libwx/wx_type.h
+++ b/drivers/net/ethernet/wangxun/libwx/wx_type.h
@@ -1221,6 +1221,8 @@ enum wx_state {
WX_STATE_PTP_RUNNING,
WX_STATE_PTP_TX_IN_PROGRESS,
WX_STATE_SERVICE_SCHED,
+ WX_STATE_DISABLED,
+ WX_STATE_RES_FREED,
WX_STATE_NBITS /* must be last */
};
@@ -1288,6 +1290,7 @@ enum wx_pf_flags {
WX_FLAG_RX_MERGE_ENABLED,
WX_FLAG_TXHEAD_WB_ENABLED,
WX_FLAG_NEED_PF_RESET,
+ WX_FLAG_NEED_PCIE_RECOVERY,
WX_PF_FLAGS_NBITS /* must be last */
};
@@ -1409,6 +1412,7 @@ struct wx {
void (*configure_fdir)(struct wx *wx);
int (*setup_tc)(struct net_device *netdev, u8 tc);
void (*do_reset)(struct net_device *netdev, bool reinit);
+ void (*down_suspend)(struct wx *wx);
int (*ptp_setup_sdp)(struct wx *wx);
void (*set_num_queues)(struct wx *wx);
diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
index 7dd3e12d48aa..7585d4fe4442 100644
--- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
+++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
@@ -47,6 +47,22 @@ static const struct pci_device_id ngbe_pci_tbl[] = {
{ }
};
+static void ngbe_down_suspend(struct wx *wx)
+{
+ if (test_and_set_bit(WX_STATE_RES_FREED, wx->state))
+ return;
+
+ phylink_stop(wx->phylink);
+ phylink_disconnect_phy(wx->phylink);
+
+ wx_clean_all_tx_rings(wx);
+ wx_clean_all_rx_rings(wx);
+
+ wx_free_irq(wx);
+ wx_free_isb_resources(wx);
+ wx_free_resources(wx);
+}
+
/**
* ngbe_init_type_code - Initialize the shared code
* @wx: pointer to hardware structure
@@ -135,6 +151,7 @@ static int ngbe_sw_init(struct wx *wx)
wx->mbx.size = WX_VXMAILBOX_SIZE;
wx->setup_tc = ngbe_setup_tc;
wx->do_reset = ngbe_do_reset;
+ wx->down_suspend = ngbe_down_suspend;
set_bit(0, &wx->fwd_bitmask);
return 0;
@@ -413,6 +430,9 @@ static void ngbe_disable_device(struct wx *wx)
static void ngbe_reset(struct wx *wx)
{
+ if (test_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags))
+ return;
+
wx_flush_sw_mac_table(wx);
wx_mac_set_default_filter(wx, wx->mac.addr);
if (test_bit(WX_STATE_PTP_RUNNING, wx->state))
@@ -435,6 +455,7 @@ static void ngbe_up_complete(struct wx *wx)
/* make sure to complete pre-operations */
smp_mb__before_atomic();
clear_bit(WX_STATE_DOWN, wx->state);
+ clear_bit(WX_STATE_RES_FREED, wx->state);
wx_napi_enable_all(wx);
/* enable transmits */
netif_tx_start_all_queues(wx->netdev);
@@ -529,6 +550,9 @@ static int ngbe_close(struct net_device *netdev)
{
struct wx *wx = netdev_priv(netdev);
+ if (test_bit(WX_STATE_RES_FREED, wx->state))
+ return 0;
+
wx_ptp_stop(wx);
ngbe_down(wx);
wx_free_irq(wx);
@@ -566,7 +590,8 @@ static void ngbe_dev_shutdown(struct pci_dev *pdev, bool *enable_wake)
*enable_wake = !!wufc;
wx_control_hw(wx, false);
- pci_disable_device(pdev);
+ if (!test_and_set_bit(WX_STATE_DISABLED, wx->state))
+ pci_disable_device(pdev);
}
static void ngbe_shutdown(struct pci_dev *pdev)
@@ -856,6 +881,7 @@ static int ngbe_probe(struct pci_dev *pdev,
goto err_register;
pci_set_drvdata(pdev, wx);
+ pci_save_state(pdev);
return 0;
@@ -911,7 +937,8 @@ static void ngbe_remove(struct pci_dev *pdev)
kfree(wx->mac_table);
wx_clear_interrupt_scheme(wx);
- pci_disable_device(pdev);
+ if (!test_and_set_bit(WX_STATE_DISABLED, wx->state))
+ pci_disable_device(pdev);
}
static int ngbe_suspend(struct pci_dev *pdev, pm_message_t state)
@@ -938,6 +965,7 @@ static int ngbe_resume(struct pci_dev *pdev)
wx_err(wx, "Cannot enable PCI device from suspend\n");
return err;
}
+ clear_bit(WX_STATE_DISABLED, wx->state);
pci_set_master(pdev);
device_wakeup_disable(&pdev->dev);
@@ -962,6 +990,7 @@ static struct pci_driver ngbe_driver = {
.resume = ngbe_resume,
.shutdown = ngbe_shutdown,
.sriov_configure = wx_pci_sriov_configure,
+ .err_handler = &wx_err_handler,
};
module_pci_driver(ngbe_driver);
diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
index f6e596eb9217..bee42ac234c2 100644
--- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
+++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c
@@ -155,6 +155,7 @@ static void txgbe_up_complete(struct wx *wx)
/* make sure to complete pre-operations */
smp_mb__before_atomic();
clear_bit(WX_STATE_DOWN, wx->state);
+ clear_bit(WX_STATE_RES_FREED, wx->state);
wx_napi_enable_all(wx);
switch (wx->mac.type) {
@@ -198,6 +199,9 @@ static void txgbe_reset(struct wx *wx)
u8 old_addr[ETH_ALEN];
int err;
+ if (test_bit(WX_FLAG_NEED_PCIE_RECOVERY, wx->flags))
+ return;
+
err = txgbe_reset_hw(wx);
if (err != 0)
wx_err(wx, "Hardware Error: %d\n", err);
@@ -304,6 +308,20 @@ void txgbe_up(struct wx *wx)
txgbe_up_complete(wx);
}
+static void txgbe_down_suspend(struct wx *wx)
+{
+ if (test_and_set_bit(WX_STATE_RES_FREED, wx->state))
+ return;
+
+ phylink_stop(wx->phylink);
+ wx_clean_all_tx_rings(wx);
+ wx_clean_all_rx_rings(wx);
+ wx_free_irq(wx);
+ txgbe_free_misc_irq(wx->priv);
+ wx_free_resources(wx);
+ txgbe_fdir_filter_exit(wx);
+}
+
/**
* txgbe_init_type_code - Initialize the shared code
* @wx: pointer to hardware structure
@@ -420,6 +438,7 @@ static int txgbe_sw_init(struct wx *wx)
wx->setup_tc = txgbe_setup_tc;
wx->do_reset = txgbe_do_reset;
+ wx->down_suspend = txgbe_down_suspend;
set_bit(0, &wx->fwd_bitmask);
switch (wx->mac.type) {
@@ -530,6 +549,9 @@ static int txgbe_close(struct net_device *netdev)
{
struct wx *wx = netdev_priv(netdev);
+ if (test_bit(WX_STATE_RES_FREED, wx->state))
+ return 0;
+
wx_ptp_stop(wx);
txgbe_down(wx);
wx_free_irq(wx);
@@ -556,7 +578,8 @@ static void txgbe_dev_shutdown(struct pci_dev *pdev)
wx_control_hw(wx, false);
- pci_disable_device(pdev);
+ if (!test_and_set_bit(WX_STATE_DISABLED, wx->state))
+ pci_disable_device(pdev);
}
static void txgbe_shutdown(struct pci_dev *pdev)
@@ -908,6 +931,7 @@ static int txgbe_probe(struct pci_dev *pdev,
goto err_remove_phy;
pci_set_drvdata(pdev, wx);
+ pci_save_state(pdev);
netif_tx_stop_all_queues(netdev);
@@ -982,7 +1006,8 @@ static void txgbe_remove(struct pci_dev *pdev)
kfree(wx->mac_table);
wx_clear_interrupt_scheme(wx);
- pci_disable_device(pdev);
+ if (!test_and_set_bit(WX_STATE_DISABLED, wx->state))
+ pci_disable_device(pdev);
}
static struct pci_driver txgbe_driver = {
@@ -992,6 +1017,7 @@ static struct pci_driver txgbe_driver = {
.remove = txgbe_remove,
.shutdown = txgbe_shutdown,
.sriov_configure = wx_pci_sriov_configure,
+ .err_handler = &wx_err_handler,
};
module_pci_driver(txgbe_driver);
--
2.51.0
next prev parent reply other threads:[~2026-06-10 6:12 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-10 6:09 [PATCH net-next v6 0/5] net: wangxun: timeout and error Jiawen Wu
2026-06-10 6:09 ` [PATCH net-next v6 1/5] net: ngbe: implement libwx reset ops Jiawen Wu
2026-06-10 6:09 ` [PATCH net-next v6 2/5] net: wangxun: add Tx timeout process Jiawen Wu
2026-06-10 6:09 ` [PATCH net-next v6 3/5] net: wangxun: add reinit parameter to wx->do_reset callback Jiawen Wu
2026-06-10 6:09 ` [PATCH net-next v6 4/5] net: wangxun: implement soft quiesce for PCIe error recovery Jiawen Wu
2026-06-12 15:49 ` Simon Horman
2026-06-10 6:09 ` Jiawen Wu [this message]
2026-06-12 15:41 ` [PATCH net-next v6 5/5] net: wangxun: add pcie error handler Simon Horman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260610060917.23980-6-jiawenwu@trustnetic.com \
--to=jiawenwu@trustnetic.com \
--cc=aleksandr.loktionov@intel.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=fabio.baltieri@gmail.com \
--cc=gregkh@linuxfoundation.org \
--cc=horms@kernel.org \
--cc=jacob.e.keller@intel.com \
--cc=joe@dama.to \
--cc=kees@kernel.org \
--cc=kuba@kernel.org \
--cc=larysa.zaremba@intel.com \
--cc=leitao@debian.org \
--cc=linux@armlinux.org.uk \
--cc=mengyuanlou@net-swift.com \
--cc=michal.swiatkowski@linux.intel.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=richardcochran@gmail.com \
--cc=tglx@kernel.org \
--cc=u.kleine-koenig@baylibre.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox