From: Raju Rangoju <Raju.Rangoju@amd.com>
To: <netdev@vger.kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <pabeni@redhat.com>,
<kuba@kernel.org>, <edumazet@google.com>, <davem@davemloft.net>,
<andrew+netdev@lunn.ch>, <Thomas.Lendacky@amd.com>,
<maxime.chevallier@bootlin.com>,
Raju Rangoju <Raju.Rangoju@amd.com>
Subject: [PATCH v2 net] amd-xgbe: synchronize KR training with device operations
Date: Wed, 18 Mar 2026 14:46:08 +0530 [thread overview]
Message-ID: <20260318091608.1266381-1-Raju.Rangoju@amd.com> (raw)
During 10GBASE-KR link training, the PHY state machine can be corrupted
if device stop or rate change operations are initiated while training is
in progress. This manifests as:
- Link stability issues after interface down/up cycles
- PHY state machine lockups requiring a full driver reset
- Intermittent link failures on Inphi re-driver configurations
The root cause is that the firmware mailbox operations for device stop
and rate changes can interfere with ongoing KR training sequences,
leaving the PHY in an inconsistent state.
Add synchronization to prevent device operations from interrupting
active KR training:
- Introduce a mailbox mutex to serialize firmware command access
- Wait for KR training completion (or timeout) before proceeding
with stop/rate change operations
- Only wait when KR training is actually active (KR mode with
autoneg enabled or Inphi re-driver present)
- Use a 500ms timeout to handle hung training sequences
The mailbox mutex protects the critical section of firmware command
submission and completion checking, preventing concurrent mailbox
access from multiple code paths.
Testing on AMD platforms with both direct-attach and Inphi re-driver
configurations shows this eliminates PHY state corruption during
interface operations and link changes.
Fixes: 549b32af9f7c ("amd-xgbe: Simplify mailbox interface rate change code")
Signed-off-by: Raju Rangoju <Raju.Rangoju@amd.com>
---
Changes since v1:
- use scoped_guard() instead of guard() for functions longer than 20
lines
drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +
drivers/net/ethernet/amd/xgbe/xgbe-main.c | 1 +
drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c | 77 ++++++++++++++++++---
drivers/net/ethernet/amd/xgbe/xgbe.h | 5 ++
4 files changed, 74 insertions(+), 11 deletions(-)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index 23beea48ae26..3913eb7e1da3 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -1321,6 +1321,8 @@ static void xgbe_stop(struct xgbe_prv_data *pdata)
DBGPR("-->xgbe_stop\n");
+ xgbe_check_kr_training_in_progress(pdata);
+
if (test_bit(XGBE_STOPPED, &pdata->dev_state))
return;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 7d45ea22a02e..5f3ab29707b7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -78,6 +78,7 @@ struct xgbe_prv_data *xgbe_alloc_pdata(struct device *dev)
spin_lock_init(&pdata->xpcs_lock);
mutex_init(&pdata->rss_mutex);
+ mutex_init(&pdata->mailbox_lock);
spin_lock_init(&pdata->tstamp_lock);
mutex_init(&pdata->i2c_mutex);
init_completion(&pdata->i2c_complete);
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index b8cf6ccfe641..4c1ecbdfcabc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -2095,12 +2095,57 @@ static void xgbe_phy_pll_ctrl(struct xgbe_prv_data *pdata, bool enable)
usleep_range(100, 200);
}
+static bool xgbe_phy_port_is_inphi(struct xgbe_prv_data *pdata)
+{
+ struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+ /* Report whether this port sits behind a supported Inphi re-driver:
+ * true only when a re-driver is present (phy_data->redrv) and its
+ * model is 4223 or 4227, the supported Inphi models.
+ */
+ return phy_data->redrv &&
+ (phy_data->redrv_model == XGBE_PHY_REDRV_MODEL_4223 ||
+ phy_data->redrv_model == XGBE_PHY_REDRV_MODEL_4227);
+}
+
+void xgbe_check_kr_training_in_progress(struct xgbe_prv_data *pdata)
+{
+ struct xgbe_phy_data *phy_data = pdata->phy_data;
+ unsigned long kr_timeout;
+ int wait;
+
+ /* Delay the caller while KR training may still be running, so the
+ * operation that follows does not overlap with it. Returns nothing;
+ * the caller proceeds regardless of which exit condition was hit.
+ *
+ * Only wait for KR training in specific conditions:
+ * - Inphi re-driver is present, OR
+ * - Currently in KR mode with autoneg enabled
+ * In every other case return immediately without sleeping.
+ */
+ if (!xgbe_phy_port_is_inphi(pdata) &&
+ !(phy_data->cur_mode == XGBE_MODE_KR &&
+ pdata->phy.autoneg == AUTONEG_ENABLE))
+ return;
+
+ wait = XGBE_KR_TRAINING_WAIT_ITER;
+ while (wait--) {
+ /* Poll at most XGBE_KR_TRAINING_WAIT_ITER times, sleeping
+ * 10-11 ms between checks. Stop early once jiffies is past
+ * kr_start_time plus the AN timeout and the extra training
+ * margin (XGBE_AN_MS_TIMEOUT + XGBE_KR_TRAINING_WAIT_MS).
+ * The deadline is recomputed on every pass from the current
+ * value of pdata->kr_start_time.
+ */
+ kr_timeout = pdata->kr_start_time +
+ msecs_to_jiffies(XGBE_AN_MS_TIMEOUT +
+ XGBE_KR_TRAINING_WAIT_MS);
+ if (time_after(jiffies, kr_timeout))
+ break;
+
+ /* Training is complete (an_result is XGBE_AN_COMPLETE) -
+ * no need to wait any longer
+ */
+ if (pdata->an_result == XGBE_AN_COMPLETE)
+ return;
+
+ usleep_range(10000, 11000);
+ }
+}
+
static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata,
- enum xgbe_mb_cmd cmd, enum xgbe_mb_subcmd sub_cmd)
+ enum xgbe_mb_cmd cmd,
+ enum xgbe_mb_subcmd sub_cmd)
{
unsigned int s0 = 0;
unsigned int wait;
+ xgbe_check_kr_training_in_progress(pdata);
+
/* Disable PLL re-initialization during FW command processing */
xgbe_phy_pll_ctrl(pdata, false);
@@ -2115,20 +2160,30 @@ static void xgbe_phy_perform_ratechange(struct xgbe_prv_data *pdata,
XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, COMMAND, cmd);
XP_SET_BITS(s0, XP_DRIVER_SCRATCH_0, SUB_COMMAND, sub_cmd);
- /* Issue the command */
- XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_0, s0);
- XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_1, 0);
- XP_IOWRITE_BITS(pdata, XP_DRIVER_INT_REQ, REQUEST, 1);
+ /* Acquire mailbox lock for firmware command.
+ * Lock is released at the end of the scoped_guard() block, preventing
+ * recursive deadlock when rx_adaptation calls back into this function.
+ */
+ scoped_guard(mutex, &pdata->mailbox_lock) {
+ /* Issue the firmware command */
- /* Wait for command to complete */
- wait = XGBE_RATECHANGE_COUNT;
- while (wait--) {
- if (!XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS))
- goto do_rx_adaptation;
+ XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_0, s0);
+ XP_IOWRITE(pdata, XP_DRIVER_SCRATCH_1, 0);
+ XP_IOWRITE_BITS(pdata, XP_DRIVER_INT_REQ, REQUEST, 1);
- usleep_range(1000, 2000);
+ /* Wait for command to complete */
+ wait = XGBE_RATECHANGE_COUNT;
+ while (wait--) {
+ if (!XP_IOREAD_BITS(pdata, XP_DRIVER_INT_RO, STATUS))
+ break;
+
+ usleep_range(1000, 2000);
+ }
}
+ if (wait != (unsigned int)-1)
+ goto do_rx_adaptation;
+
netif_dbg(pdata, link, pdata->netdev,
"firmware mailbox command did not complete\n");
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 438033a71523..238eeee0d422 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -202,6 +202,7 @@
#define XGBE_AN_MS_TIMEOUT 500
#define XGBE_LINK_TIMEOUT 5
#define XGBE_KR_TRAINING_WAIT_ITER 50
+#define XGBE_KR_TRAINING_WAIT_MS 100
#define XGBE_SGMII_AN_LINK_DUPLEX BIT(1)
#define XGBE_SGMII_AN_LINK_SPEED (BIT(2) | BIT(3))
@@ -1015,6 +1016,9 @@ struct xgbe_prv_data {
/* RSS addressing mutex */
struct mutex rss_mutex;
+ /* Firmware mailbox mutex */
+ struct mutex mailbox_lock;
+
/* Flags representing xgbe_state */
unsigned long dev_state;
@@ -1252,6 +1256,7 @@ struct xgbe_prv_data {
};
/* Function prototypes*/
+void xgbe_check_kr_training_in_progress(struct xgbe_prv_data *pdata);
struct xgbe_prv_data *xgbe_alloc_pdata(struct device *);
void xgbe_free_pdata(struct xgbe_prv_data *);
void xgbe_set_counts(struct xgbe_prv_data *);
--
2.34.1
next reply other threads:[~2026-03-18 9:18 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-18 9:16 Raju Rangoju [this message]
2026-03-21 2:55 ` [PATCH v2 net] amd-xgbe: synchronize KR training with device operations Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260318091608.1266381-1-Raju.Rangoju@amd.com \
--to=raju.rangoju@amd.com \
--cc=Thomas.Lendacky@amd.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=maxime.chevallier@bootlin.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox