* [PATCH v6 14/21] net/txgbe: fix link stability for Amber-Lite backplane mode
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The link was previously configured via firmware, but this approach
resulted in unstable link behavior. To resolve the issue, re-add the
PHY configuration flow directly into the driver.
Fixes: ead3616f630d ("net/txgbe: support PHY configuration via SW-FW mailbox")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/base/meson.build | 1 +
drivers/net/txgbe/base/txgbe.h | 2 +
drivers/net/txgbe/base/txgbe_aml.c | 65 +-
drivers/net/txgbe/base/txgbe_aml40.c | 43 +-
drivers/net/txgbe/base/txgbe_e56.c | 22 +-
drivers/net/txgbe/base/txgbe_e56.h | 2 +
drivers/net/txgbe/base/txgbe_e56_bp.c | 2597 +++++++++++++++++++++++++
drivers/net/txgbe/base/txgbe_e56_bp.h | 3 +
drivers/net/txgbe/base/txgbe_hw.c | 6 +
drivers/net/txgbe/base/txgbe_hw.h | 4 +-
drivers/net/txgbe/base/txgbe_osdep.h | 4 +
drivers/net/txgbe/base/txgbe_phy.c | 21 +
drivers/net/txgbe/base/txgbe_phy.h | 22 +
drivers/net/txgbe/base/txgbe_type.h | 25 +-
drivers/net/txgbe/txgbe_ethdev.c | 109 +-
drivers/net/txgbe/txgbe_ethdev.h | 2 +-
16 files changed, 2898 insertions(+), 30 deletions(-)
create mode 100644 drivers/net/txgbe/base/txgbe_e56_bp.c
diff --git a/drivers/net/txgbe/base/meson.build b/drivers/net/txgbe/base/meson.build
index 305c0291e3..a9a02577ce 100644
--- a/drivers/net/txgbe/base/meson.build
+++ b/drivers/net/txgbe/base/meson.build
@@ -13,4 +13,5 @@ base_sources = files(
'txgbe_phy.c',
'txgbe_vf.c',
'txgbe_e56.c',
+ 'txgbe_e56_bp.c',
)
diff --git a/drivers/net/txgbe/base/txgbe.h b/drivers/net/txgbe/base/txgbe.h
index 673a299860..27c3e3be38 100644
--- a/drivers/net/txgbe/base/txgbe.h
+++ b/drivers/net/txgbe/base/txgbe.h
@@ -13,5 +13,7 @@
#include "txgbe_hw.h"
#include "txgbe_vf.h"
#include "txgbe_dcb.h"
+#include "txgbe_e56.h"
+#include "txgbe_e56_bp.h"
#endif /* _TXGBE_H_ */
diff --git a/drivers/net/txgbe/base/txgbe_aml.c b/drivers/net/txgbe/base/txgbe_aml.c
index 6388893bca..5d449a0bd9 100644
--- a/drivers/net/txgbe/base/txgbe_aml.c
+++ b/drivers/net/txgbe/base/txgbe_aml.c
@@ -13,6 +13,7 @@
#include "txgbe_hw.h"
#include "txgbe_aml.h"
#include "txgbe_e56.h"
+#include "txgbe_e56_bp.h"
void txgbe_init_ops_aml(struct txgbe_hw *hw)
{
@@ -84,6 +85,13 @@ s32 txgbe_check_mac_link_aml(struct txgbe_hw *hw, u32 *speed,
*speed = TXGBE_LINK_SPEED_UNKNOWN;
}
+ if (txgbe_xpcs_an_enabled(hw)) {
+ if (!hw->an_done) {
+ *link_up = false;
+ *speed = TXGBE_LINK_SPEED_UNKNOWN;
+ }
+ }
+
return 0;
}
@@ -95,23 +103,41 @@ s32 txgbe_get_link_capabilities_aml(struct txgbe_hw *hw,
*speed = TXGBE_LINK_SPEED_10GB_FULL |
TXGBE_LINK_SPEED_25GB_FULL;
*autoneg = true;
+ } else if (hw->phy.sfp_type == txgbe_sfp_type_da_cu_core0 ||
+ hw->phy.sfp_type == txgbe_sfp_type_da_cu_core1) {
+ if (hw->phy.fiber_suppport_speed ==
+ TXGBE_LINK_SPEED_10GB_FULL) {
+ hw->devarg.auto_neg = false;
+ *autoneg = false;
+ } else {
+ *autoneg = true;
+ }
+ *speed = hw->phy.fiber_suppport_speed;
} else if (hw->phy.sfp_type == txgbe_sfp_type_25g_sr_core0 ||
hw->phy.sfp_type == txgbe_sfp_type_25g_sr_core1 ||
hw->phy.sfp_type == txgbe_sfp_type_25g_lr_core0 ||
- hw->phy.sfp_type == txgbe_sfp_type_25g_lr_core1) {
+ hw->phy.sfp_type == txgbe_sfp_type_25g_lr_core1 ||
+ hw->phy.sfp_type == txgbe_sfp_type_25g_aoc_core0 ||
+ hw->phy.sfp_type == txgbe_sfp_type_25g_aoc_core1) {
*speed = TXGBE_LINK_SPEED_25GB_FULL;
*autoneg = false;
- } else if (hw->phy.sfp_type == txgbe_sfp_type_25g_aoc_core0 ||
- hw->phy.sfp_type == txgbe_sfp_type_25g_aoc_core1) {
- *speed = TXGBE_LINK_SPEED_25GB_FULL;
+ } else if (hw->phy.media_type == txgbe_media_type_backplane) {
+ /* Backplane */
+ *speed = TXGBE_LINK_SPEED_10GB_FULL |
+ TXGBE_LINK_SPEED_25GB_FULL;
+ /* Backplane supports autonegotiation */
+ *autoneg = hw->devarg.auto_neg;
+ } else if (hw->phy.media_type == txgbe_media_type_fiber) {
+ /* Fiber */
+ *speed = TXGBE_LINK_SPEED_10GB_FULL |
+ TXGBE_LINK_SPEED_25GB_FULL;
*autoneg = false;
} else {
- /* SFP */
- if (hw->phy.sfp_type == txgbe_sfp_type_not_present)
- *speed = TXGBE_LINK_SPEED_25GB_FULL;
- else
- *speed = TXGBE_LINK_SPEED_10GB_FULL;
- *autoneg = true;
+ /* Unknown */
+ *speed = TXGBE_LINK_SPEED_UNKNOWN;
+ *autoneg = false;
+ PMD_DRV_LOG(DEBUG, "GET link capabilities failed");
+ return TXGBE_ERR_LINK_SETUP;
}
return 0;
@@ -193,7 +219,7 @@ s32 txgbe_setup_phy_link_aml(struct txgbe_hw *hw,
*need_reset = false;
- if (hw->phy.sfp_type == txgbe_sfp_type_not_present) {
+ if (hw->phy.sfp_type == txgbe_sfp_type_not_present && !txgbe_is_backplane(hw)) {
DEBUGOUT("SFP not detected, skip setup mac link");
return 0;
}
@@ -216,6 +242,23 @@ s32 txgbe_setup_phy_link_aml(struct txgbe_hw *hw,
if (speed == TXGBE_LINK_SPEED_UNKNOWN)
return TXGBE_ERR_LINK_SETUP;
+ if (txgbe_xpcs_an_enabled(hw)) {
+ txgbe_e56_check_phy_link(hw, &link_speed, &link_up);
+ if (link_up && hw->an_done && !autoneg_wait_to_complete)
+ return status;
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_set_phy_link_mode(hw, speed, autoneg_wait_to_complete);
+ rte_spinlock_unlock(&hw->phy_lock);
+ return 0;
+ }
+
+ if (txgbe_is_backplane(hw) || txgbe_is_dac_cable(hw) ||
+ hw->phy.ffe_set) {
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_tx_ffe_cfg(hw, speed);
+ rte_spinlock_unlock(&hw->phy_lock);
+ }
+
if (txgbe_gpio_ext_check(hw, TXGBE_SFP1_MOD_ABS_LS |
TXGBE_SFP1_RX_LOS_LS)) {
DEBUGOUT("RX LOS");
diff --git a/drivers/net/txgbe/base/txgbe_aml40.c b/drivers/net/txgbe/base/txgbe_aml40.c
index 09bc7ed58c..1098efe5e6 100644
--- a/drivers/net/txgbe/base/txgbe_aml40.c
+++ b/drivers/net/txgbe/base/txgbe_aml40.c
@@ -14,6 +14,7 @@
#include "txgbe_aml.h"
#include "txgbe_aml40.h"
#include "txgbe_e56.h"
+#include "txgbe_e56_bp.h"
void txgbe_init_ops_aml40(struct txgbe_hw *hw)
{
@@ -98,7 +99,10 @@ s32 txgbe_get_link_capabilities_aml40(struct txgbe_hw *hw,
if (hw->phy.sfp_type == txgbe_qsfp_type_40g_cu_core0 ||
hw->phy.sfp_type == txgbe_qsfp_type_40g_cu_core1) {
*speed = TXGBE_LINK_SPEED_40GB_FULL;
- *autoneg = false;
+ *autoneg = true;
+ } else if (txgbe_is_backplane(hw)) {
+ *speed = TXGBE_LINK_SPEED_40GB_FULL;
+ *autoneg = true;
} else {
/*
* Temporary workaround: set speed to 40G even if sfp not present
@@ -115,8 +119,22 @@ s32 txgbe_get_link_capabilities_aml40(struct txgbe_hw *hw,
u32 txgbe_get_media_type_aml40(struct txgbe_hw *hw)
{
- UNREFERENCED_PARAMETER(hw);
- return txgbe_media_type_fiber_qsfp;
+ u8 device_type = hw->subsystem_device_id & 0xF0;
+ enum txgbe_media_type media_type;
+
+ switch (device_type) {
+ case TXGBE_DEV_ID_KR_KX_KX4:
+ media_type = txgbe_media_type_backplane;
+ break;
+ case TXGBE_DEV_ID_SFP:
+ media_type = txgbe_media_type_fiber_qsfp;
+ break;
+ default:
+ media_type = txgbe_media_type_unknown;
+ break;
+ }
+
+ return media_type;
}
s32 txgbe_setup_phy_link_aml40(struct txgbe_hw *hw,
@@ -135,7 +153,7 @@ s32 txgbe_setup_phy_link_aml40(struct txgbe_hw *hw,
*need_reset = false;
- if (hw->phy.sfp_type == txgbe_sfp_type_not_present)
+ if (hw->phy.sfp_type == txgbe_sfp_type_not_present && !txgbe_is_backplane(hw))
hw->phy.identify_sfp(hw);
/* Check to see if speed passed in is supported. */
@@ -148,6 +166,23 @@ s32 txgbe_setup_phy_link_aml40(struct txgbe_hw *hw,
if (speed == TXGBE_LINK_SPEED_UNKNOWN)
return TXGBE_ERR_LINK_SETUP;
+ if (txgbe_xpcs_an_enabled(hw)) {
+ txgbe_e56_check_phy_link(hw, &link_speed, &link_up);
+ if (link_up && hw->an_done && !autoneg_wait_to_complete)
+ return status;
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_set_phy_link_mode(hw, 40, autoneg_wait_to_complete);
+ rte_spinlock_unlock(&hw->phy_lock);
+ return status;
+ }
+
+ if (txgbe_is_backplane(hw) || txgbe_is_dac_cable(hw) ||
+ hw->phy.ffe_set) {
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_tx_ffe_cfg(hw, speed);
+ rte_spinlock_unlock(&hw->phy_lock);
+ }
+
for (i = 0; i < 4; i++) {
txgbe_e56_check_phy_link(hw, &link_speed, &link_up);
if (link_up)
diff --git a/drivers/net/txgbe/base/txgbe_e56.c b/drivers/net/txgbe/base/txgbe_e56.c
index 3566d13426..0ac306387b 100644
--- a/drivers/net/txgbe/base/txgbe_e56.c
+++ b/drivers/net/txgbe/base/txgbe_e56.c
@@ -53,7 +53,7 @@ int txgbe_e56_int_cmp(const void *a, const void *b)
}
s32 txgbe_e56_check_phy_link(struct txgbe_hw *hw, u32 *speed,
- bool *link_up)
+ bool *link_up)
{
u32 rdata = 0;
u32 links_reg = 0;
@@ -101,7 +101,8 @@ u32 txgbe_e56_tx_ffe_cfg(struct txgbe_hw *hw, u32 speed)
post = S10G_TX_FFE_CFG_POST;
} else if (speed == TXGBE_LINK_SPEED_25GB_FULL) {
if (hw->phy.sfp_type == txgbe_sfp_type_da_cu_core0 ||
- hw->phy.sfp_type == txgbe_sfp_type_da_cu_core1) {
+ hw->phy.sfp_type == txgbe_sfp_type_da_cu_core1 ||
+ txgbe_is_backplane(hw)) {
ffe_main = S25G_TX_FFE_CFG_DAC_MAIN;
pre1 = S25G_TX_FFE_CFG_DAC_PRE1;
pre2 = S25G_TX_FFE_CFG_DAC_PRE2;
@@ -119,7 +120,8 @@ u32 txgbe_e56_tx_ffe_cfg(struct txgbe_hw *hw, u32 speed)
post = S10G_TX_FFE_CFG_POST;
if (hw->phy.sfp_type == txgbe_qsfp_type_40g_cu_core0 ||
- hw->phy.sfp_type == txgbe_qsfp_type_40g_cu_core1) {
+ hw->phy.sfp_type == txgbe_qsfp_type_40g_cu_core1 ||
+ txgbe_is_backplane(hw)) {
ffe_main = S40G_TX_FFE_CFG_MAIN;
pre1 = S40G_TX_FFE_CFG_PRE1;
pre2 = S40G_TX_FFE_CFG_PRE2;
@@ -1508,7 +1510,7 @@ txgbe_e56_rxs_osc_init_for_temp_track_range(struct txgbe_hw *hw, u32 speed)
rdata = rd32_ephy(hw, addr);
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
return -1;
}
}
@@ -1542,7 +1544,7 @@ txgbe_e56_rxs_osc_init_for_temp_track_range(struct txgbe_hw *hw, u32 speed)
if (((rdata >> (i * 8)) & 0x3f) == 0x21)
break;
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
return -1;
}
}
@@ -1618,7 +1620,7 @@ txgbe_e56_rxs_osc_init_for_temp_track_range(struct txgbe_hw *hw, u32 speed)
addr = E56PHY_CTRL_FSM_RX_STAT_0_ADDR;
rdata = rd32_ephy(hw, addr);
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
return -1;
}
}
@@ -1664,7 +1666,7 @@ txgbe_e56_rxs_osc_init_for_temp_track_range(struct txgbe_hw *hw, u32 speed)
if (((rdata >> (i * 8)) & 0x3f) == 0x21)
break;
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
return -1;
}
}
@@ -1929,7 +1931,7 @@ int txgbe_temp_track_seq_40g(struct txgbe_hw *hw, u32 speed)
CMVAR_UFINE_FMIN_WRAP = S25G_CMVAR_UFINE_FMIN_WRAP;
CMVAR_FINE_FMIN_WRAP = S25G_CMVAR_FINE_FMIN_WRAP;
} else {
- DEBUGOUT("Error Speed\n");
+ DEBUGOUT("Error Speed");
return 0;
}
@@ -3190,7 +3192,7 @@ static int txgbe_e56_disable_rx40G(struct txgbe_hw *hw)
rdata = rd32_ephy(hw, addr);
usec_delay(100);
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
break;
}
}
@@ -3296,7 +3298,7 @@ static int txgbe_e56_disable_rx(struct txgbe_hw *hw)
break;
usec_delay(100);
if (timer++ > PHYINIT_TIMEOUT) {
- DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!\n");
+ DEBUGOUT("ERROR: Wait E56PHY_CTRL_FSM_RX_STAT_0_ADDR Timeout!");
break;
}
}
diff --git a/drivers/net/txgbe/base/txgbe_e56.h b/drivers/net/txgbe/base/txgbe_e56.h
index aeee0618a6..feb4de0546 100644
--- a/drivers/net/txgbe/base/txgbe_e56.h
+++ b/drivers/net/txgbe/base/txgbe_e56.h
@@ -1744,6 +1744,8 @@ int txgbe_temp_track_seq(struct txgbe_hw *hw, u32 speed);
int txgbe_e56_get_temp(struct txgbe_hw *hw, int *temp);
int txgbe_set_link_to_amlite(struct txgbe_hw *hw, u32 speed);
int txgbe_e56_reconfig_rx(struct txgbe_hw *hw, u32 speed);
+s32 txgbe_e56_check_phy_link(struct txgbe_hw *hw, u32 *speed,
+ bool *link_up);
s32 txgbe_e56_fec_set(struct txgbe_hw *hw);
s32 txgbe_e56_fec_polling(struct txgbe_hw *hw, bool *link_up);
u32 txgbe_e56_tx_ffe_cfg(struct txgbe_hw *hw, u32 speed);
diff --git a/drivers/net/txgbe/base/txgbe_e56_bp.c b/drivers/net/txgbe/base/txgbe_e56_bp.c
new file mode 100644
index 0000000000..1237b73914
--- /dev/null
+++ b/drivers/net/txgbe/base/txgbe_e56_bp.c
@@ -0,0 +1,2597 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2024-2026 Beijing WangXun Technology Co., Ltd.
+ */
+
+#include "txgbe_e56.h"
+#include "txgbe_hw.h"
+#include "txgbe_osdep.h"
+#include "txgbe_phy.h"
+#include "txgbe_e56_bp.h"
+#include "txgbe.h"
+#include "../txgbe_logs.h"
+
+/*
+ * Limit the RX CDR ultrafine code of every active lane to the maximum
+ * allowed for the link mode.
+ *
+ * For each lane the ultrafine code (bits 14:12 of RXSn_ANA_OVRDVAL_5) is
+ * read; while it is above CMVAR_UFINE_MAX it is decremented by one, written
+ * back, and bit 3 of RXSn_ANA_OVRDEN_1 is set so the written value
+ * overrides the ASIC value. Each step is followed by a 1 ms delay.
+ *
+ * @hw: adapter handle used for the E56 PHY register accesses
+ * @speed: link mode: 10 or 25 (one lane), 40 (four lanes)
+ *
+ * Returns 0 on success, -1 if @speed is not one of the supported modes.
+ */
+static int
+txgbe_e56_set_rxs_ufine_le_max(struct txgbe_hw *hw, u32 speed)
+{
+	u32 rdata, addr;
+	u32 ULTRAFINE_CODE[4] = {0};
+	int lane_num = 0, lane_idx = 0;
+	u32 CMVAR_UFINE_MAX = 0;
+
+	switch (speed) {
+	case 10:
+		CMVAR_UFINE_MAX = S10G_CMVAR_UFINE_MAX;
+		lane_num = 1;
+		break;
+	case 40:
+		/* 40G runs four lanes with the 10G per-lane limit. */
+		CMVAR_UFINE_MAX = S10G_CMVAR_UFINE_MAX;
+		lane_num = 4;
+		break;
+	case 25:
+		CMVAR_UFINE_MAX = S25G_CMVAR_UFINE_MAX;
+		lane_num = 1;
+		break;
+	default:
+		/* Report the bad mode instead of silently claiming success. */
+		BP_LOG("%s %d :Invalid speed\n", __func__, __LINE__);
+		return -1;
+	}
+
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		/* Fetch ana_bbcdr_ultrafine_i[14:12] of this lane. */
+		addr = E56G__RXS0_ANA_OVRDVAL_5_ADDR + (E56PHY_RXS_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		ULTRAFINE_CODE[lane_idx] = FIELD_GET_M(GENMASK(14, 12), rdata);
+		BP_LOG("ULTRAFINE_CODE[%d] = %u, CMVAR_UFINE_MAX: %x\n",
+		       lane_idx, ULTRAFINE_CODE[lane_idx], CMVAR_UFINE_MAX);
+	}
+
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		/* Step the code down one unit at a time until it fits. */
+		while (ULTRAFINE_CODE[lane_idx] > CMVAR_UFINE_MAX) {
+			ULTRAFINE_CODE[lane_idx] -= 1;
+			addr = E56G__RXS0_ANA_OVRDVAL_5_ADDR +
+			       (E56PHY_RXS_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 14, 12, ULTRAFINE_CODE[lane_idx]);
+			wr32_ephy(hw, addr, rdata);
+
+			/* ovrd_en_ana_bbcdr_ultrafine=1 override ASIC value */
+			addr = E56G__RXS0_ANA_OVRDEN_1_ADDR +
+			       (E56PHY_RXS_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			wr32_ephy(hw, addr, rdata | BIT(3));
+
+			/* Wait 1 millisecond or longer between steps. */
+			usec_delay(1000);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Choose the RX oscillator range used for temperature tracking.
+ *
+ * The receiver is enabled once with CMVAR_RANGE_H and once with
+ * CMVAR_RANGE_L programmed into bits 1:0 of register 0x0b4 of each lane.
+ * After each enable, bits 7:4 of the same register are read and their
+ * distance to a temperature dependent target (RX_COARSE_MID_TD) is
+ * recorded. The low range is kept for a lane only when its distance is
+ * strictly smaller; otherwise the high range is used. The chosen range is
+ * then written, the per-lane PMD override bits are reprogrammed and RX is
+ * enabled again.
+ *
+ * @hw: adapter handle used for the E56 PHY register accesses
+ * @speed: link mode: 10 or 25 (one lane), 40 (four lanes)
+ *
+ * Returns 0 on success, -1 if @speed is not a supported mode (no register
+ * is touched in that case), otherwise the OR of the kr_read_poll() results
+ * of the four RX state polls.
+ */
+static int txgbe_e56_rxs_osc_init_for_temp_track_range(struct txgbe_hw *hw,
+						       u32 speed)
+{
+	int OFFSET_CENTRE_RANGE_H[4] = {0};
+	int OFFSET_CENTRE_RANGE_L[4] = {0};
+	int RANGE_FINAL[4] = {0};
+	int RX_COARSE_MID_TD, CMVAR_RANGE_H = 0, CMVAR_RANGE_L = 0;
+	int status = 0, lane_num = 0;
+	int T = 40, lane_id = 0;
+	u32 addr, rdata;
+
+	/* Set CMVAR_RANGE_H/L based on the link speed mode */
+	switch (speed) {
+	case 10:
+		CMVAR_RANGE_H = S10G_CMVAR_RANGE_H;
+		CMVAR_RANGE_L = S10G_CMVAR_RANGE_L;
+		lane_num = 1;
+		break;
+	case 40:
+		CMVAR_RANGE_H = S10G_CMVAR_RANGE_H;
+		CMVAR_RANGE_L = S10G_CMVAR_RANGE_L;
+		lane_num = 4;
+		break;
+	case 25:
+		CMVAR_RANGE_H = S25G_CMVAR_RANGE_H;
+		CMVAR_RANGE_L = S25G_CMVAR_RANGE_L;
+		lane_num = 1;
+		break;
+	default:
+		/*
+		 * With lane_num left at 0 the lane masks below would be
+		 * built from GENMASK(-1, 0), so bail out before any
+		 * register is written.
+		 */
+		BP_LOG("%s %d :Invalid speed\n", __func__, __LINE__);
+		return -1;
+	}
+
+	/* 1. Read the temperature T just before RXS is enabled. */
+	txgbe_e56_get_temp(hw, &T);
+
+	/* 2. Define software variable RX_COARSE_MID_TD */
+	if (T < -5)
+		RX_COARSE_MID_TD = 10;
+	else if (T < 30)
+		RX_COARSE_MID_TD = 9;
+	else if (T < 65)
+		RX_COARSE_MID_TD = 8;
+	else if (T < 100)
+		RX_COARSE_MID_TD = 7;
+	else
+		RX_COARSE_MID_TD = 6;
+
+	/* 3. Program the high range on every lane and set up the trial. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		addr = 0x0b4 + (0x200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 1, 0, CMVAR_RANGE_H);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x08c + (0x200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 29, 29, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1540 + (0x02c * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 22, 22, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1530 + (0x02c * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 27, 27, 0x1);
+		wr32_ephy(hw, addr, rdata);
+	}
+
+	/* 4. Enable RX on the active lanes and wait for state 0x09. */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 19, 16, GENMASK(lane_num - 1, 0));
+	wr32_ephy(hw, 0x1400, rdata);
+	status |= kr_read_poll(rd32_ephy, rdata,
+			       (((rdata & 0x3f3f3f3f) & GENMASK(8 * lane_num - 1, 0))
+			       == (0x09090909 & GENMASK(8 * lane_num - 1, 0))),
+			       100, 2000, hw,
+			       E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	if (status)
+		BP_LOG("Wait fsm_rx_sts 1 = %x : %d, Wait rx_sts %s.\n",
+		       rdata, status, status ? "FAILED" : "SUCCESS");
+
+	/* 5. Record |bits 7:4 of 0x0b4 - RX_COARSE_MID_TD| for the high range. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		addr = 0x0b4 + (0x0200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		OFFSET_CENTRE_RANGE_H[lane_id] = (rdata >> 4) & 0xf;
+		if (OFFSET_CENTRE_RANGE_H[lane_id] > RX_COARSE_MID_TD)
+			OFFSET_CENTRE_RANGE_H[lane_id] = OFFSET_CENTRE_RANGE_H[lane_id] -
+							 RX_COARSE_MID_TD;
+		else
+			OFFSET_CENTRE_RANGE_H[lane_id] = RX_COARSE_MID_TD -
+							 OFFSET_CENTRE_RANGE_H[lane_id];
+	}
+
+	/* 6. Do SEQ::RX_DISABLE to disable RXS and wait for state 0x21. */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 19, 16, 0x0);
+	wr32_ephy(hw, 0x1400, rdata);
+	status |= kr_read_poll(rd32_ephy, rdata,
+			       (((rdata & 0x3f3f3f3f) & GENMASK(8 * lane_num - 1, 0))
+			       == (0x21212121 & GENMASK(8 * lane_num - 1, 0))),
+			       100, 2000, hw,
+			       E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	if (status)
+		BP_LOG("Wait fsm_rx_sts 2 = %x : %d, Wait rx_sts %s.\n",
+		       rdata, status, status ? "FAILED" : "SUCCESS");
+	/*
+	 * Register 0x15ec is written back with the value just read.
+	 * NOTE(review): presumably this acknowledges latched status bits;
+	 * confirm against the PHY documentation.
+	 */
+	rdata = rd32_ephy(hw, 0x15ec);
+	wr32_ephy(hw, 0x15ec, rdata);
+
+	/* 7. Repeat the trial with the low range. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		addr = 0x0b4 + (0x200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 1, 0, CMVAR_RANGE_L);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x08c + (0x200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 29, 29, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1540 + (0x02c * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 22, 22, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1530 + (0x02c * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 27, 27, 0x1);
+		wr32_ephy(hw, addr, rdata);
+	}
+	/*
+	 * NOTE(review): this enable writes 0xf (all four lanes) while the
+	 * first enable above uses GENMASK(lane_num - 1, 0). Kept as is;
+	 * confirm whether the difference is intended for 1-lane modes.
+	 */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 19, 16, 0xf);
+	wr32_ephy(hw, 0x1400, rdata);
+	status |= kr_read_poll(rd32_ephy, rdata,
+			       (((rdata & 0x3f3f3f3f) & GENMASK(8 * lane_num - 1, 0))
+			       == (0x09090909 & GENMASK(8 * lane_num - 1, 0))),
+			       100, 2000, hw,
+			       E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	if (status)
+		BP_LOG("Wait fsm_rx_sts 3 = %x : %d, Wait rx_sts %s.\n",
+		       rdata, status, status ? "FAILED" : "SUCCESS");
+
+	/* 8. Record the distance to the target for the low range. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		addr = 0x0b4 + (0x0200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		OFFSET_CENTRE_RANGE_L[lane_id] = (rdata >> 4) & 0xf;
+		if (OFFSET_CENTRE_RANGE_L[lane_id] > RX_COARSE_MID_TD)
+			OFFSET_CENTRE_RANGE_L[lane_id] = OFFSET_CENTRE_RANGE_L[lane_id] -
+							 RX_COARSE_MID_TD;
+		else
+			OFFSET_CENTRE_RANGE_L[lane_id] = RX_COARSE_MID_TD -
+							 OFFSET_CENTRE_RANGE_L[lane_id];
+	}
+
+	/* 9. Keep the low range only when it is strictly closer. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		RANGE_FINAL[lane_id] = OFFSET_CENTRE_RANGE_L[lane_id] <
+				       OFFSET_CENTRE_RANGE_H[lane_id] ?
+				       CMVAR_RANGE_L : CMVAR_RANGE_H;
+		BP_LOG("lane_id:%d-RANGE_L:%x-RANGE_H:%x-RANGE_FINAL:%x\n",
+		       lane_id, OFFSET_CENTRE_RANGE_L[lane_id],
+		       OFFSET_CENTRE_RANGE_H[lane_id], RANGE_FINAL[lane_id]);
+	}
+
+	/* 10. Do SEQ::RX_DISABLE to disable RXS and wait for state 0x21. */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 19, 16, 0x0);
+	wr32_ephy(hw, 0x1400, rdata);
+	status |= kr_read_poll(rd32_ephy, rdata,
+			       (((rdata & 0x3f3f3f3f) & GENMASK(8 * lane_num - 1, 0))
+			       == (0x21212121 & GENMASK(8 * lane_num - 1, 0))),
+			       100, 2000, hw,
+			       E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	if (status)
+		BP_LOG("Wait fsm_rx_sts 4 = %x : %d, Wait rx_sts %s.\n",
+		       rdata, status, status ? "FAILED" : "SUCCESS");
+	rdata = rd32_ephy(hw, 0x15ec);
+	wr32_ephy(hw, 0x15ec, rdata);
+
+	/* 11. Program the selected range and the per-lane PMD override bits. */
+	for (lane_id = 0; lane_id < lane_num; lane_id++) {
+		addr = 0x0b4 + (0x0200 * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 1, 0, RANGE_FINAL[lane_id]);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 25, 25, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1538 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 0, 0, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 28, 28, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1538 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 3, 3, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 16, 16, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1534 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 23, 23, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 17, 17, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1534 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 24, 24, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 31, 31, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1538 + (lane_id * E56PHY_PMD_RX_OFFSET);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 6, 6, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		/* Clear bit 27 of 0x1530 that was set for the trials. */
+		addr = 0x1530 + (0x02c * lane_id);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 27, 27, 0x0);
+		wr32_ephy(hw, addr, rdata);
+	}
+
+	/* Do SEQ::RX_ENABLE */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, E56PHY_PMD_CFG_0_RX_EN_CFG, GENMASK(lane_num - 1, 0));
+	wr32_ephy(hw, 0x1400, rdata);
+
+	return status;
+}
+
+/*
+ * Apply one temperature-tracking step to the RXS0 CDR codes.
+ *
+ * The second-order code is compared with the low/high thresholds of the
+ * link mode. At or below the low threshold one of the ultrafine, fine or
+ * coarse codes is incremented; at or above the high threshold one of them
+ * is decremented. The finest code that has not reached its limit is
+ * stepped, finer codes are reloaded with their wrap values, and the
+ * matching override enables are set. Between the thresholds nothing is
+ * written.
+ *
+ * Thresholds are loaded for speed 10 and 25 only; for any other value all
+ * limits stay 0.
+ *
+ * @hw: adapter handle used for the E56 PHY register accesses
+ * @speed: link mode, 10 or 25
+ *
+ * Always returns 0.
+ */
+static int txgbe_e56_rxs_post_cdr_lock_temp_track_seq(struct txgbe_hw *hw,
+						      u32 speed)
+{
+	int sec_low_th = 0, sec_high_th = 0;
+	int ufine_min = 0, ufine_max = 0;
+	int fine_min = 0, fine_max = 0;
+	int coarse_min = 0, coarse_max = 0;
+	int ufine_umin_wrap = 0, ufine_umax_wrap = 0;
+	int ufine_fmin_wrap = 0, ufine_fmax_wrap = 0;
+	int fine_fmin_wrap = 0, fine_fmax_wrap = 0;
+	int second_code, coarse_code, fine_code, ufine_code;
+	/* Kept under this name: referenced by the EPHY_* register macros. */
+	u32 rdata;
+
+	switch (speed) {
+	case 10:
+		sec_low_th = S10G_CMVAR_SEC_LOW_TH;
+		sec_high_th = S10G_CMVAR_SEC_HIGH_TH;
+		ufine_max = S10G_CMVAR_UFINE_MAX;
+		ufine_min = S10G_CMVAR_UFINE_MIN;
+		fine_max = S10G_CMVAR_FINE_MAX;
+		fine_min = S10G_CMVAR_FINE_MIN;
+		coarse_max = S10G_CMVAR_COARSE_MAX;
+		coarse_min = S10G_CMVAR_COARSE_MIN;
+		ufine_umax_wrap = S10G_CMVAR_UFINE_UMAX_WRAP;
+		ufine_umin_wrap = S10G_CMVAR_UFINE_UMIN_WRAP;
+		ufine_fmax_wrap = S10G_CMVAR_UFINE_FMAX_WRAP;
+		ufine_fmin_wrap = S10G_CMVAR_UFINE_FMIN_WRAP;
+		fine_fmax_wrap = S10G_CMVAR_FINE_FMAX_WRAP;
+		fine_fmin_wrap = S10G_CMVAR_FINE_FMIN_WRAP;
+		break;
+	case 25:
+		sec_low_th = S25G_CMVAR_SEC_LOW_TH;
+		sec_high_th = S25G_CMVAR_SEC_HIGH_TH;
+		ufine_max = S25G_CMVAR_UFINE_MAX;
+		ufine_min = S25G_CMVAR_UFINE_MIN;
+		fine_max = S25G_CMVAR_FINE_MAX;
+		fine_min = S25G_CMVAR_FINE_MIN;
+		coarse_max = S25G_CMVAR_COARSE_MAX;
+		coarse_min = S25G_CMVAR_COARSE_MIN;
+		ufine_umax_wrap = S25G_CMVAR_UFINE_UMAX_WRAP;
+		ufine_umin_wrap = S25G_CMVAR_UFINE_UMIN_WRAP;
+		ufine_fmax_wrap = S25G_CMVAR_UFINE_FMAX_WRAP;
+		ufine_fmin_wrap = S25G_CMVAR_UFINE_FMIN_WRAP;
+		fine_fmax_wrap = S25G_CMVAR_FINE_FMAX_WRAP;
+		fine_fmin_wrap = S25G_CMVAR_FINE_FMIN_WRAP;
+		break;
+	default:
+		/* Other modes leave every limit at 0. */
+		break;
+	}
+
+	txgbe_e56_rx_rd_second_code(hw, &second_code);
+
+	/* Snapshot the three CDR codes currently in RXS0_ANA_OVRDVAL_5. */
+	EPHY_RREG(E56G__RXS0_ANA_OVRDVAL_5);
+	coarse_code = EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_coarse_i);
+	fine_code = EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_fine_i);
+	ufine_code = EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_ultrafine_i);
+
+	/* Strictly between the thresholds: no adjustment needed. */
+	if (second_code > sec_low_th && second_code < sec_high_th)
+		return 0;
+
+	if (second_code <= sec_low_th) {
+		/* Step up, starting from the finest code with headroom. */
+		if (ufine_code < ufine_max) {
+			txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_ultrafine_i,
+					      ufine_code + 1);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else if (fine_code < fine_max) {
+			EPHY_RREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_ultrafine_i) = ufine_umax_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_fine_i) = fine_code + 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_fine_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else if (coarse_code < coarse_max) {
+			EPHY_RREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_ultrafine_i) = ufine_fmax_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_fine_i) = fine_fmax_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_coarse_i) = coarse_code + 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_coarse_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_fine_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else {
+			BP_LOG("ERROR: (SECOND_CODE <= CMVAR_SEC_LOW_TH) temperature tracking occurs Error condition\n");
+		}
+	} else {
+		/* second_code >= sec_high_th: step down the same way. */
+		if (ufine_code > ufine_min) {
+			txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_ultrafine_i,
+					      ufine_code - 1);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else if (fine_code > fine_min) {
+			EPHY_RREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_ultrafine_i) = ufine_umin_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_fine_i) = fine_code - 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_fine_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else if (coarse_code > coarse_min) {
+			EPHY_RREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_ultrafine_i) = ufine_fmin_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5,
+				  ana_bbcdr_fine_i) = fine_fmin_wrap;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDVAL_5, ana_bbcdr_coarse_i) = coarse_code - 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDVAL_5);
+			EPHY_RREG(E56G__RXS0_ANA_OVRDEN_1);
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_coarse_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_fine_i) = 1;
+			EPHY_XFLD(E56G__RXS0_ANA_OVRDEN_1, ovrd_en_ana_bbcdr_ultrafine_i) = 1;
+			EPHY_WREG(E56G__RXS0_ANA_OVRDEN_1);
+		} else {
+			BP_LOG("ERROR: (SECOND_CODE >= CMVAR_SEC_HIGH_TH) temperature tracking occurs Error condition\n");
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Put the RX CTLE into bypass and mark CTLE training as done.
+ *
+ * For lane 0 the ana_ctle_bypass_i override is set to 1,
+ * ana_ctle_cz_cstm_i is overridden to 0, and the PMD overrides force
+ * ctle_train_en to 0 and ctle_train_done to 1. When @bp_link_mode is 40
+ * the same settings are applied to lanes 1-3.
+ *
+ * @hw: adapter handle used for the E56 PHY register accesses
+ * @bp_link_mode: backplane link mode; 40 selects the four-lane sequence
+ *
+ * Always returns 0.
+ */
+static int txgbe_e56_ctle_bypass_seq(struct txgbe_hw *hw, u8 bp_link_mode)
+{
+	/* Kept under this name: referenced by the EPHY_* register macros. */
+	u32 rdata;
+
+	/* Lane 0: bypass the CTLE and zero its custom cz setting. */
+	txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDVAL_0, ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDEN_0, ovrd_en_ana_ctle_bypass_i, 1);
+
+	txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDVAL_3, ana_ctle_cz_cstm_i, 0);
+	txgbe_e56_ephy_config(E56G__RXS0_ANA_OVRDEN_0, ovrd_en_ana_ctle_cz_cstm_i, 1);
+
+	/* Lane 0: training disabled, training-done forced high. */
+	EPHY_RREG(E56G__PMD_RXS0_OVRDVAL_1);
+	EPHY_XFLD(E56G__PMD_RXS0_OVRDVAL_1, rxs0_rx0_ctle_train_en_i) = 0;
+	EPHY_XFLD(E56G__PMD_RXS0_OVRDVAL_1, rxs0_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS0_OVRDVAL_1);
+
+	EPHY_RREG(E56G__PMD_RXS0_OVRDEN_1);
+	EPHY_XFLD(E56G__PMD_RXS0_OVRDEN_1, ovrd_en_rxs0_rx0_ctle_train_en_i) = 1;
+	EPHY_XFLD(E56G__PMD_RXS0_OVRDEN_1, ovrd_en_rxs0_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS0_OVRDEN_1);
+
+	/* Only the 40G mode programs the remaining lanes. */
+	if (bp_link_mode != 40)
+		return 0;
+
+	/* Lanes 1-3: CTLE bypass. */
+	txgbe_e56_ephy_config(E56G__RXS1_ANA_OVRDVAL_0, ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS1_ANA_OVRDEN_0, ovrd_en_ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS2_ANA_OVRDVAL_0, ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS2_ANA_OVRDEN_0, ovrd_en_ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS3_ANA_OVRDVAL_0, ana_ctle_bypass_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS3_ANA_OVRDEN_0, ovrd_en_ana_ctle_bypass_i, 1);
+
+	/* Lanes 1-3: custom cz setting overridden to 0. */
+	txgbe_e56_ephy_config(E56G__RXS1_ANA_OVRDVAL_3, ana_ctle_cz_cstm_i, 0);
+	txgbe_e56_ephy_config(E56G__RXS1_ANA_OVRDEN_0, ovrd_en_ana_ctle_cz_cstm_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS2_ANA_OVRDVAL_3, ana_ctle_cz_cstm_i, 0);
+	txgbe_e56_ephy_config(E56G__RXS2_ANA_OVRDEN_0, ovrd_en_ana_ctle_cz_cstm_i, 1);
+	txgbe_e56_ephy_config(E56G__RXS3_ANA_OVRDVAL_3, ana_ctle_cz_cstm_i, 0);
+	txgbe_e56_ephy_config(E56G__RXS3_ANA_OVRDEN_0, ovrd_en_ana_ctle_cz_cstm_i, 1);
+
+	/* Lanes 1-3: override values for training enable / done. */
+	EPHY_RREG(E56G__PMD_RXS1_OVRDVAL_1);
+	EPHY_XFLD(E56G__PMD_RXS1_OVRDVAL_1, rxs1_rx0_ctle_train_en_i) = 0;
+	EPHY_XFLD(E56G__PMD_RXS1_OVRDVAL_1, rxs1_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS1_OVRDVAL_1);
+	EPHY_RREG(E56G__PMD_RXS2_OVRDVAL_1);
+	EPHY_XFLD(E56G__PMD_RXS2_OVRDVAL_1, rxs2_rx0_ctle_train_en_i) = 0;
+	EPHY_XFLD(E56G__PMD_RXS2_OVRDVAL_1, rxs2_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS2_OVRDVAL_1);
+	EPHY_RREG(E56G__PMD_RXS3_OVRDVAL_1);
+	EPHY_XFLD(E56G__PMD_RXS3_OVRDVAL_1, rxs3_rx0_ctle_train_en_i) = 0;
+	EPHY_XFLD(E56G__PMD_RXS3_OVRDVAL_1, rxs3_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS3_OVRDVAL_1);
+
+	/* Lanes 1-3: turn the corresponding override enables on. */
+	EPHY_RREG(E56G__PMD_RXS1_OVRDEN_1);
+	EPHY_XFLD(E56G__PMD_RXS1_OVRDEN_1, ovrd_en_rxs1_rx0_ctle_train_en_i) = 1;
+	EPHY_XFLD(E56G__PMD_RXS1_OVRDEN_1, ovrd_en_rxs1_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS1_OVRDEN_1);
+	EPHY_RREG(E56G__PMD_RXS2_OVRDEN_1);
+	EPHY_XFLD(E56G__PMD_RXS2_OVRDEN_1, ovrd_en_rxs2_rx0_ctle_train_en_i) = 1;
+	EPHY_XFLD(E56G__PMD_RXS2_OVRDEN_1, ovrd_en_rxs2_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS2_OVRDEN_1);
+	EPHY_RREG(E56G__PMD_RXS3_OVRDEN_1);
+	EPHY_XFLD(E56G__PMD_RXS3_OVRDEN_1, ovrd_en_rxs3_rx0_ctle_train_en_i) = 1;
+	EPHY_XFLD(E56G__PMD_RXS3_OVRDEN_1, ovrd_en_rxs3_rx0_ctle_train_done_o) = 1;
+	EPHY_WREG(E56G__PMD_RXS3_OVRDEN_1);
+
+	return 0;
+}
+
+/**
+ * txgbe_e56_rxs_adc_adapt_seq - per-lane RX ADC calibration/adaptation sequence
+ * @hw: pointer to hardware structure; hw->bp_link_mode selects the lane count
+ *      (10 and 25 -> 1 lane, 40 -> 4 lanes)
+ * @bypass_ctle: when 0, CTLE training is re-enabled together with VGA
+ *               training and its completion is waited for; when non-zero
+ *               the CTLE steps are skipped
+ *
+ * For every active lane the per-lane registers at 0x1534, 0x1538 and 0x1544
+ * (each offset by E56PHY_PMD_RX_OFFSET * lane) are driven through:
+ * wait for CDR ready, disable VGA/CTLE training, ADC interleaver
+ * calibration, 16 rounds of ADC offset/gain adaptation, ADC interleaver
+ * adaptation, and finally re-enabling VGA (and optionally CTLE) training.
+ *
+ * Poll failures are logged but do not abort the sequence.
+ *
+ * Return: the result of the last kr_read_poll() that was executed (the
+ * VGA-train-done wait of the last lane), or 0 when hw->bp_link_mode is not
+ * one of 10/25/40 (no lane is processed in that case).
+ */
+static int txgbe_e56_rxs_adc_adapt_seq(struct txgbe_hw *hw, u32 bypass_ctle)
+{
+	int lane_num = 0, lane_idx = 0;
+	u32 rdata = 0, addr = 0;
+	int status = 0;
+	int timer = 0, j = 0;
+
+	switch (hw->bp_link_mode) {
+	case 10:
+	case 25:
+		lane_num = 1;
+		break;
+	case 40:
+		lane_num = 4;
+		break;
+	default:
+		BP_LOG("%s %d :Invalid speed\n", __func__, __LINE__);
+		break;
+	}
+
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		/* Wait RXS0-3_OVRDVAL[1]::rxs0-3_rx0_cdr_rdy_o = 1.
+		 * Poll the register of the current lane: polling the fixed
+		 * lane-0 address would leave lanes 1-3 unchecked.
+		 */
+		status = kr_read_poll(rd32_ephy, rdata, (rdata & BIT(12)),
+				      100, 2000, hw, addr);
+		if (status)
+			BP_LOG("rxs%d_rx0_cdr_rdy_o = %x, %s.\n",
+			       lane_idx, rdata,
+			       status ? "FAILED" : "SUCCESS");
+	}
+
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		/* 4. Disable VGA and CTLE training so they don't interfere
+		 * with ADC calibration
+		 */
+		/* a. Set ALIAS::RXS::VGA_TRAIN_EN = 0b0 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 7, 7, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 14, 14, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		/* b. Set ALIAS::RXS::CTLE_TRAIN_EN = 0b0 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 9, 9, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 16, 16, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		/* 5. Perform ADC interleaver calibration */
+		/* a. Remove the OVERRIDE on ALIAS::RXS::ADC_INTL_CAL_DONE */
+		addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 24, 24, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 16, 16, 0x1);
+		wr32_ephy(hw, addr, rdata);
+
+		/* Wait rxs0_rx0_adc_intl_cal_done_o bit17 = 1 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		status = kr_read_poll(rd32_ephy, rdata, (rdata & BIT(17)),
+				      100, 2000, hw, addr);
+		if (status)
+			BP_LOG("rxs0_rx0_adc_intl_cal_done_o = %x, %s.\n", rdata,
+			       status ? "FAILED" : "SUCCESS");
+
+		/* 6. Perform ADC offset adaptation and ADC gain adaptation,
+		 * repeat them a few times and after that keep it disabled.
+		 */
+		for (j = 0; j < 16; j++) {
+			/* a. ALIAS::RXS::ADC_OFST_ADAPT_EN = 0b1 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 25, 25, 0x1);
+			wr32_ephy(hw, addr, rdata);
+
+			/* b. Wait for 1ms or greater */
+			/* set ovrd_en_rxs0_rx0_adc_ofst_adapt_done_o bit1=0 */
+			addr = 0x1538 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 1, 1, 0);
+			wr32_ephy(hw, addr, rdata);
+
+			/* Wait rxs0_rx0_adc_ofst_adapt_done_o bit26 = 0 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			status = kr_read_poll(rd32_ephy, rdata,
+					      !(rdata & BIT(26)),
+					      100, 2000, hw, addr);
+			if (status)
+				BP_LOG("rxs0_rx0_adc_ofst_adapt_done_o %d = %x, %s.\n",
+				       j, rdata, status ? "FAILED" : "SUCCESS");
+
+			/* c. ALIAS::RXS::ADC_OFST_ADAPT_EN = 0b0 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 25, 25, 0x0);
+			wr32_ephy(hw, addr, rdata);
+
+			/* d. ALIAS::RXS::ADC_GAIN_ADAPT_EN = 0b1 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 28, 28, 0x1);
+			wr32_ephy(hw, addr, rdata);
+
+			/* e. Wait for 1ms or greater */
+			/* NOTE(review): this clears bit 1 of 0x1538 again, the
+			 * same bit as the offset-adapt step above. A dedicated
+			 * gain-adapt-done override bit may have been intended;
+			 * confirm against the register map before changing.
+			 */
+			addr = 0x1538 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 1, 1, 0);
+			wr32_ephy(hw, addr, rdata);
+
+			/* Wait rxs0_rx0_adc_gain_adapt_done_o bit29 = 0 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			status = kr_read_poll(rd32_ephy, rdata, !(rdata & BIT(29)),
+					      100, 2000, hw, addr);
+			if (status)
+				BP_LOG("rxs0_rx0_adc_gain_adapt_done_o %d = %x, %s.\n",
+				       j, rdata, status ? "FAILED" : "SUCCESS");
+
+			/* f. ALIAS::RXS::ADC_GAIN_ADAPT_EN = 0b0 */
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, 28, 28, 0x0);
+			wr32_ephy(hw, addr, rdata);
+		}
+		/* g. Steps #a to #f are repeated 16 times in total */
+
+		/* 7. Perform ADC interleaver adaptation for 10ms or greater,
+		 * and after that disable it
+		 */
+		/* a. ALIAS::RXS::ADC_INTL_ADAPT_EN = 0b1 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 31, 31, 0x1);
+		wr32_ephy(hw, addr, rdata);
+		/* b. Wait for 10ms or greater */
+		msleep(20);
+
+		/* c. ALIAS::RXS::ADC_INTL_ADAPT_EN = 0b0 */
+		/* set ovrd_en_rxs0_rx0_adc_intl_adapt_en_i=0 */
+		addr = 0x1538 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 6, 6, 0);
+		wr32_ephy(hw, addr, rdata);
+
+		/* 8. Now re-enable VGA and CTLE trainings, so that it continues
+		 * to adapt tracking changes in temperature or voltage
+		 * <1>Set ALIAS::RXS::VGA_TRAIN_EN = 0b1
+		 */
+		/* set rxs0_rx0_vga_train_en_i=1 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 7, 7, 0x1);
+		/* EPHY_XFLD presumably accesses the local rdata through the
+		 * register's bit-field layout - confirm against its definition.
+		 */
+		if (bypass_ctle == 0)
+			EPHY_XFLD(E56G__PMD_RXS0_OVRDVAL_1, rxs0_rx0_ctle_train_en_i) = 1;
+		wr32_ephy(hw, addr, rdata);
+
+		/* <2>wait for ALIAS::RXS::VGA_TRAIN_DONE = 1 */
+		/* set ovrd_en_rxs0_rx0_vga_train_done_o = 0 */
+		addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 15, 15, 0x0);
+		wr32_ephy(hw, addr, rdata);
+
+		/* Wait rxs0_rx0_vga_train_done_o bit8 = 1 */
+		addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		status = kr_read_poll(rd32_ephy, rdata, (rdata & BIT(8)),
+				      100, 3000, hw, addr);
+		if (status)
+			BP_LOG("rxs0_rx0_vga_train_done_o = %x, %s.\n", rdata,
+			       status ? "FAILED" : "SUCCESS");
+
+		if (bypass_ctle == 0) {
+			addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			rdata = rd32_ephy(hw, addr);
+			EPHY_XFLD(E56G__PMD_RXS0_OVRDEN_1,
+				  ovrd_en_rxs0_rx0_ctle_train_done_o) = 0;
+			wr32_ephy(hw, addr, rdata);
+
+			/* Poll for ctle_train_done_o; rdata is cleared first
+			 * because the loop condition is evaluated before the
+			 * first read. A timeout ends the wait silently.
+			 */
+			rdata = 0;
+			timer = 0;
+			addr = 0x1544 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+			while (EPHY_XFLD(E56G__PMD_RXS0_OVRDVAL_1,
+					 rxs0_rx0_ctle_train_done_o) != 1) {
+				rdata = rd32_ephy(hw, addr);
+				usec_delay(500);
+
+				if (timer++ > PHYINIT_TIMEOUT)
+					break;
+			}
+		}
+
+		/* a. Remove the OVERRIDE on ALIAS::RXS::VGA_TRAIN_EN */
+		addr = 0x1534 + (E56PHY_PMD_RX_OFFSET * lane_idx);
+		rdata = rd32_ephy(hw, addr);
+		set_fields_e56(&rdata, 15, 15, 0);
+		/* b. Remove the OVERRIDE on ALIAS::RXS::CTLE_TRAIN_EN */
+		if (bypass_ctle == 0)
+			EPHY_XFLD(E56G__PMD_RXS0_OVRDEN_1,
+				  ovrd_en_rxs0_rx0_ctle_train_en_i) = 0;
+		wr32_ephy(hw, addr, rdata);
+	}
+
+	return status;
+}
+
+/**
+ * txgbe_e56_phy_rxs_calib_adapt_seq - preset RX overrides and wait for the RX FSM
+ * @hw: pointer to hardware structure
+ * @bp_link_mode: backplane mode; 10 and 25 use one lane, 40 uses four lanes
+ * @bypass_ctle: when non-zero, txgbe_e56_ctle_bypass_seq() is run as well
+ *
+ * Applies a fixed list of single-bit read-modify-write operations to the
+ * per-lane registers at 0x1534/0x1538/0x1544 (offset by
+ * E56PHY_PMD_RX_OFFSET per lane), runs the optional CTLE bypass and the
+ * oscillator init helper, then polls E56PHY_CTRL_FSM_RX_STAT_0_ADDR until
+ * the low 6 bits of every active lane's status byte read 0x1b.
+ *
+ * Return: bitwise OR of the status values of the helpers and the final poll.
+ */
+static int txgbe_e56_phy_rxs_calib_adapt_seq(struct txgbe_hw *hw,
+					      u8 bp_link_mode, u32 bypass_ctle)
+{
+	/* Per-lane sequence, executed strictly in this order. */
+	static const struct {
+		u32 reg;	/* lane-0 register address */
+		u32 bit;	/* position of the single-bit field */
+		u32 val;	/* value written into that bit */
+	} lane_steps[] = {
+		{ 0x1544, 25, 0x0 },
+		{ 0x1538,  0, 0x1 },
+		{ 0x1544, 28, 0x0 },
+		{ 0x1538,  3, 0x1 },
+		{ 0x1544, 16, 0x0 },
+		{ 0x1534, 23, 0x1 },
+		{ 0x1544, 17, 0x1 },
+		{ 0x1534, 24, 0x1 },
+		{ 0x1544, 31, 0x0 },
+		{ 0x1538,  6, 0x1 },
+	};
+	const unsigned int step_cnt = sizeof(lane_steps) / sizeof(lane_steps[0]);
+	int lane_num = 0, lane_idx = 0;
+	unsigned int step;
+	int status = 0;
+	u32 rdata, addr;
+
+	if (bp_link_mode == 10 || bp_link_mode == 25)
+		lane_num = 1;
+	else if (bp_link_mode == 40)
+		lane_num = 4;
+	else
+		BP_LOG("%s %d :Invalid speed\n", __func__, __LINE__);
+
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		for (step = 0; step < step_cnt; step++) {
+			addr = lane_steps[step].reg +
+			       (lane_idx * E56PHY_PMD_RX_OFFSET);
+			rdata = rd32_ephy(hw, addr);
+			set_fields_e56(&rdata, lane_steps[step].bit,
+				       lane_steps[step].bit,
+				       lane_steps[step].val);
+			wr32_ephy(hw, addr, rdata);
+		}
+	}
+
+	if (bypass_ctle != 0)
+		status |= txgbe_e56_ctle_bypass_seq(hw, bp_link_mode);
+
+	status |= txgbe_e56_rxs_osc_init_for_temp_track_range(hw, bp_link_mode);
+
+	/* Wait an fsm_rx_sts 25G */
+	BP_LOG("Wait CTRL_FSM_RX_STAT[0]::ctrl_fsm_rx0_st to be ready ...\n");
+
+	/* NOTE(review): with an invalid mode lane_num stays 0 and the mask
+	 * below becomes GENMASK(-1, 0) - verify callers never pass such a mode.
+	 */
+	status |= kr_read_poll(rd32_ephy, rdata,
+			       (((rdata & 0x3f3f3f3f) & GENMASK(8 * lane_num - 1, 0))
+				== (0x1b1b1b1b & GENMASK(8 * lane_num - 1, 0))),
+			       1000, 300, hw,
+			       E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	BP_LOG("wait ctrl_fsm_rx0_st = %x, %s.\n",
+	       rdata, status ? "FAILED" : "SUCCESS");
+
+	return status;
+}
+
+/**
+ * txgbe_e56_cms_cfg_for_temp_track_range - pick CMS LCPLL overrides by temperature
+ * @hw: pointer to hardware structure
+ *
+ * Reads the temperature through txgbe_e56_get_temp() (the local defaults to
+ * 40 beforehand) and programs the CMS analog override registers:
+ *  - 40 <= temp <= 70: enable the HF/LF TEST_IN overrides and set bit 24 and
+ *    bits 31:29 of OVRDVAL_4/OVRDVAL_9, clearing bits 1:0 of
+ *    OVRDVAL_5/OVRDVAL_10;
+ *  - otherwise: enable the HF/LF LPF_SETCODE_CALIB overrides and write
+ *    setcode 0x1 (temp < 40) or 0x3 (temp > 70).
+ *
+ * Return: the status returned by txgbe_e56_get_temp().
+ */
+static int txgbe_e56_cms_cfg_for_temp_track_range(struct txgbe_hw *hw)
+{
+	int status = 0, temp = 40;
+	u32 setcode;
+	u32 reg, val;
+
+	status = txgbe_e56_get_temp(hw, &temp);
+
+	if (temp >= 40 && temp <= 70) {
+		/* Mid range: drive the HF path through the TEST_IN override */
+		reg = E56PHY_CMS_ANA_OVRDEN_1_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, E56PHY_CMS_ANA_OVRDEN_1_OVRD_EN_ANA_LCPLL_HF_TEST_IN_I,
+			       0x1);
+		wr32_ephy(hw, reg, val);
+
+		reg = E56PHY_CMS_ANA_OVRDVAL_4_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, 24, 24, 0x1);
+		set_fields_e56(&val, 31, 29, 0x4);
+		wr32_ephy(hw, reg, val);
+
+		reg = E56PHY_CMS_ANA_OVRDVAL_5_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, 1, 0, 0x0);
+		wr32_ephy(hw, reg, val);
+
+		/* ... and the LF path likewise */
+		reg = E56PHY_CMS_ANA_OVRDEN_1_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, E56PHY_CMS_ANA_OVRDEN_1_OVRD_EN_ANA_LCPLL_LF_TEST_IN_I,
+			       0x1);
+		wr32_ephy(hw, reg, val);
+
+		reg = E56PHY_CMS_ANA_OVRDVAL_9_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, 24, 24, 0x1);
+		set_fields_e56(&val, 31, 29, 0x4);
+		wr32_ephy(hw, reg, val);
+
+		reg = E56PHY_CMS_ANA_OVRDVAL_10_ADDR;
+		val = rd32_ephy(hw, reg);
+		set_fields_e56(&val, 1, 0, 0x0);
+		wr32_ephy(hw, reg, val);
+
+		return status;
+	}
+
+	/* Outside 40..70: same register sequence, only the setcode differs */
+	setcode = (temp < 40) ? 0x1 : 0x3;
+
+	reg = E56PHY_CMS_ANA_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val,
+		       E56PHY_CMS_ANA_OVRDEN_0_OVRD_EN_ANA_LCPLL_HF_LPF_SETCODE_CALIB_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDVAL_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val,
+		       E56PHY_CMS_ANA_OVRDVAL_2_ANA_LCPLL_HF_LPF_SETCODE_CALIB_I, setcode);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDEN_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val,
+		       E56PHY_CMS_ANA_OVRDEN_1_OVRD_EN_ANA_LCPLL_LF_LPF_SETCODE_CALIB_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDVAL_7_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val,
+		       E56PHY_CMS_ANA_OVRDVAL_7_ANA_LCPLL_LF_LPF_SETCODE_CALIB_I, setcode);
+	wr32_ephy(hw, reg, val);
+
+	return status;
+}
+
+/**
+ * txgbe_e56_bp_cfg_25g - program the E56 PHY register set for the 25G backplane mode
+ * @hw: pointer to hardware structure
+ *
+ * Performs a fixed, ordered series of read-modify-write accesses on the
+ * CMS, TXS, RXS, PMD, CTRL_FSM, KRT and FETX registers, calls
+ * txgbe_e56_tx_ffe_cfg() for TXGBE_LINK_SPEED_25GB_FULL in between, and
+ * issues two raw writes (0x6cc and 0x94). The order of accesses matters and
+ * must not be rearranged.
+ *
+ * Return: always 0.
+ */
+static int txgbe_e56_bp_cfg_25g(struct txgbe_hw *hw)
+{
+	u32 reg, val;
+
+	/* ---- CMS_PIN / CMS_ANA registers ---- */
+	reg = E56PHY_CMS_PIN_OVRDVAL_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CMS_PIN_OVRDVAL_0_INT_PLL0_TX_SIGNAL_TYPE_I, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_PIN_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CMS_PIN_OVRDEN_0_OVRD_EN_PLL0_TX_SIGNAL_TYPE_I,
+		       0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDVAL_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CMS_ANA_OVRDVAL_2_ANA_LCPLL_HF_VCO_SWING_CTRL_I,
+		       0xf);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val,
+		       E56PHY_CMS_ANA_OVRDEN_0_OVRD_EN_ANA_LCPLL_HF_VCO_SWING_CTRL_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDVAL_4_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 23, 0, 0x260000);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CMS_ANA_OVRDEN_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CMS_ANA_OVRDEN_1_OVRD_EN_ANA_LCPLL_HF_TEST_IN_I,
+		       0x1);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- TXS registers ---- */
+	reg = E56PHY_TXS_TXS_CFG_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_TXS_TXS_CFG_1_ADAPTATION_WAIT_CNT_X256, 0xf);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_TXS_WKUP_CNT_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_TXS_WKUP_CNTLDO_WKUP_CNT_X32, 0xff);
+	set_fields_e56(&val, E56PHY_TXS_WKUP_CNTDCC_WKUP_CNT_X32, 0xff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_TXS_PIN_OVRDVAL_6_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 27, 24, 0x5);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_TXS_PIN_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_TXS_PIN_OVRDEN_0_OVRD_EN_TX0_EFUSE_BITS_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_TXS_ANA_OVRDVAL_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_TXS_ANA_OVRDVAL_1_ANA_TEST_DAC_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_TXS_ANA_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_TXS_ANA_OVRDEN_0_OVRD_EN_ANA_TEST_DAC_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	txgbe_e56_tx_ffe_cfg(hw, TXGBE_LINK_SPEED_25GB_FULL);
+
+	/* ---- RXS configuration / oscillator calibration / CDR ---- */
+	reg = E56PHY_RXS_RXS_CFG_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_RXS_CFG_0_DSER_DATA_SEL, 0x0);
+	set_fields_e56(&val, E56PHY_RXS_RXS_CFG_0_TRAIN_CLK_GATE_BYPASS_EN, 0x1fff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_OSC_CAL_N_CDR_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_1_PREDIV1, 0x700);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_1_TARGET_CNT1, 0x2418);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_OSC_CAL_N_CDR_4_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_4_OSC_RANGE_SEL1, 0x1);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_4_VCO_CODE_INIT, 0x7fb);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_4_OSC_CURRENT_BOOST_EN1, 0x0);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_4_BBCDR_CURRENT_BOOST1, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_OSC_CAL_N_CDR_5_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_SDM_WIDTH, 0x3);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_PROP_STEP_PRELOCK,
+		       0xf);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_PROP_STEP_POSTLOCK,
+		       0x3);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_GAIN_CTRL_POSTLOCK,
+		       0xa);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_GAIN_CTRL_PRELOCK,
+		       0xf);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_5_BBCDR_RDY_CNT, 0x3);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_OSC_CAL_N_CDR_6_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_6_PI_GAIN_CTRL_PRELOCK, 0x7);
+	set_fields_e56(&val, E56PHY_RXS_OSC_CAL_N_CDR_6_PI_GAIN_CTRL_POSTLOCK, 0x5);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS interleaver ---- */
+	reg = E56PHY_RXS_INTL_CONFIG_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_INTL_CONFIG_0_ADC_INTL2SLICE_DELAY1, 0x3333);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_INTL_CONFIG_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_INTL_CONFIG_2_INTERLEAVER_HBW_DISABLE1, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS TXFFE training thresholds ---- */
+	reg = E56PHY_RXS_TXFFE_TRAINING_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_0_ADC_DATA_PEAK_LTH, 0x56);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_0_ADC_DATA_PEAK_UTH, 0x6a);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_TXFFE_TRAINING_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_1_C1_LTH, 0x1f8);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_1_C1_UTH, 0xf0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_TXFFE_TRAINING_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_2_CM1_LTH, 0x100);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_2_CM1_UTH, 0xff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_TXFFE_TRAINING_3_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_3_CM2_LTH, 0x4);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_3_CM2_UTH, 0x37);
+	set_fields_e56(&val, E56PHY_RXS_TXFFE_TRAINING_3_TXFFE_TRAIN_MOD_TYPE, 0x38);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS0 FOM_18: DFE coefficient hints ---- */
+	reg = E56G__RXS0_FOM_18__ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56G__RXS0_FOM_18__DFE_COEFFL_HINT__MSB,
+		       E56G__RXS0_FOM_18__DFE_COEFFL_HINT__LSB, 0x0);
+	set_fields_e56(&val, E56G__RXS0_FOM_18__DFE_COEFFH_HINT__MSB,
+		       E56G__RXS0_FOM_18__DFE_COEFFH_HINT__LSB, 0x0);
+	set_fields_e56(&val, E56G__RXS0_FOM_18__DFE_COEFF_HINT_LOAD__MSB,
+		       E56G__RXS0_FOM_18__DFE_COEFF_HINT_LOAD__LSB, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS VGA / CTLE training ---- */
+	reg = E56PHY_RXS_VGA_TRAINING_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_VGA_TRAINING_0_VGA_TARGET, 0x34);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_VGA_TRAINING_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_VGA_TRAINING_1_VGA1_CODE_INIT0, 0xa);
+	set_fields_e56(&val, E56PHY_RXS_VGA_TRAINING_1_VGA2_CODE_INIT0, 0xa);
+	set_fields_e56(&val, E56PHY_RXS_VGA_TRAINING_1_VGA1_CODE_INIT123, 0xa);
+	set_fields_e56(&val, E56PHY_RXS_VGA_TRAINING_1_VGA2_CODE_INIT123, 0xa);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_CTLE_TRAINING_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_0_CTLE_CODE_INIT0, 0x9);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_0_CTLE_CODE_INIT123, 0x9);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_CTLE_TRAINING_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_1_LFEQ_LUT, 0x1ffffea);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_CTLE_TRAINING_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P1, 18);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P2, 0);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P3, 0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_CTLE_TRAINING_3_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P1, 1);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P2, 0);
+	set_fields_e56(&val, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P3, 0);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS offset/gain calibration, FFE training, idle detect ---- */
+	reg = E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADC_SLICE_DATA_AVG_CNT,
+		       0x3);
+	set_fields_e56(&val, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADC_DATA_AVG_CNT, 0x3);
+	set_fields_e56(&val, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_FE_OFFSET_DAC_CLK_CNT_X8,
+		       0xc);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_OFFSET_N_GAIN_CAL_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_OFFSET_N_GAIN_CAL_1_SAMP_ADAPT_CFG, 0x5);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_FFE_TRAINING_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_FFE_TRAINING_0_FFE_TAP_EN, 0xf9ff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_IDLE_DETECT_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MAX, 0xa);
+	set_fields_e56(&val, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MIN, 0x5);
+	wr32_ephy(hw, reg, val);
+
+	/* Raw full-register writes; these offsets have no symbolic name here */
+	wr32_ephy(hw, 0x6cc, 0x8020000);
+	wr32_ephy(hw, 0x94, 0);
+
+	/* ---- RXS_ANA overrides ---- */
+	reg = E56PHY_RXS_ANA_OVRDVAL_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDVAL_0_ANA_EN_RTERM_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDEN_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDEN_0_OVRD_EN_ANA_EN_RTERM_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDVAL_6_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 4, 0, 0x0);
+	set_fields_e56(&val, 14, 13, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDEN_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDEN_1_OVRD_EN_ANA_BBCDR_VCOFILT_BYP_I,
+		       0x1);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDEN_1_OVRD_EN_ANA_TEST_BBCDR_I, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDVAL_15_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 2, 0, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDVAL_17_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDVAL_17_ANA_VGA2_BOOST_CSTM_I, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDEN_3_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDEN_3_OVRD_EN_ANA_ANABS_CONFIG_I, 0x1);
+	set_fields_e56(&val, E56PHY_RXS_ANA_OVRDEN_3_OVRD_EN_ANA_VGA2_BOOST_CSTM_I,
+		       0x1);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDVAL_14_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 13, 13, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_ANA_OVRDEN_4_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 13, 13, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- RXS eye scan / ring oscillator ---- */
+	reg = E56PHY_RXS_EYE_SCAN_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_RXS_EYE_SCAN_1_EYE_SCAN_REF_TIMER, 0x400);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_RXS_RINGO_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, 21, 12, 0x366);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- PMD configuration ---- */
+	reg = E56PHY_PMD_CFG_3_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_PMD_CFG_3_CTRL_FSM_TIMEOUT_X64K, 0x80);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_PMD_CFG_4_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_PMD_CFG_4_TRAIN_DC_ON_PERIOD_X64K, 0x18);
+	set_fields_e56(&val, E56PHY_PMD_CFG_4_TRAIN_DC_PERIOD_X512K, 0x3e);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_PMD_CFG_5_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_PMD_CFG_5_USE_RECENT_MARKER_OFFSET, 0x1);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- CTRL_FSM configuration ---- */
+	reg = E56PHY_CTRL_FSM_CFG_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_0_CONT_ON_ADC_GAIN_CAL_ERR, 0x1);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_0_DO_RX_ADC_OFST_CAL, 0x3);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_0_RX_ERR_ACTION_EN, 0x40);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_1_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST0_WAIT_CNT_X4096, 0xff);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST1_WAIT_CNT_X4096, 0xff);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST2_WAIT_CNT_X4096, 0xff);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST3_WAIT_CNT_X4096, 0xff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_2_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST4_WAIT_CNT_X4096, 0x1);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST5_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST6_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST7_WAIT_CNT_X4096, 0x4);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_3_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST8_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST9_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST10_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST11_WAIT_CNT_X4096, 0x4);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_4_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST12_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST13_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST14_WAIT_CNT_X4096, 0x4);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST15_WAIT_CNT_X4096, 0x4);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_7_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_7_TRAIN_ST4_EN, 0x4bf);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_7_TRAIN_ST5_EN, 0xc4bf);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_8_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_8_TRAIN_ST7_EN, 0x47ff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_12_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_12_TRAIN_ST15_EN, 0x67ff);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_13_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_13_TRAIN_ST0_DONE_EN, 0x8001);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_13_TRAIN_ST1_DONE_EN, 0x8002);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_14_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_14_TRAIN_ST3_DONE_EN, 0x8008);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_15_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_15_TRAIN_ST4_DONE_EN, 0x8004);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_17_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_17_TRAIN_ST8_DONE_EN, 0x20c0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_18_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_18_TRAIN_ST10_DONE_EN, 0x0);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_29_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_29_TRAIN_ST15_DC_EN, 0x3f6d);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_33_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_33_TRAIN0_RATE_SEL, 0x8000);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_33_TRAIN1_RATE_SEL, 0x8000);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_CTRL_FSM_CFG_34_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_34_TRAIN2_RATE_SEL, 0x8000);
+	set_fields_e56(&val, E56PHY_CTRL_FSM_CFG_34_TRAIN3_RATE_SEL, 0x8000);
+	wr32_ephy(hw, reg, val);
+
+	/* ---- KRT training FSM / FETX FFE training ---- */
+	reg = E56PHY_KRT_TFSM_CFG_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_KRT_TFSM_CFGKRT_TFSM_MAX_WAIT_TIMER_X1000K, 0x49);
+	set_fields_e56(&val, E56PHY_KRT_TFSM_CFGKRT_TFSM_MAX_WAIT_TIMER_X8000K, 0x37);
+	set_fields_e56(&val, E56PHY_KRT_TFSM_CFGKRT_TFSM_HOLDOFF_TIMER_X256K, 0x2f);
+	wr32_ephy(hw, reg, val);
+
+	reg = E56PHY_FETX_FFE_TRAIN_CFG_0_ADDR;
+	val = rd32_ephy(hw, reg);
+	set_fields_e56(&val, E56PHY_FETX_FFE_TRAIN_CFG_0_KRT_FETX_INIT_FFE_CFG_2,
+		       0x2);
+	wr32_ephy(hw, reg, val);
+
+	return 0;
+}
+
+static int txgbe_e56_bp_cfg_10g(struct txgbe_hw *hw)
+{
+ u32 addr, rdata;
+
+ rdata = 0x0000;
+ addr = E56G__CMS_ANA_OVRDVAL_7_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G__CMS_ANA_OVRDVAL_7 *)&rdata)->ana_lcpll_lf_vco_swing_ctrl_i = 0xf;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56G__CMS_ANA_OVRDEN_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G__CMS_ANA_OVRDEN_1 *)&rdata)->ovrd_en_ana_lcpll_lf_vco_swing_ctrl_i = 0x1;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56G__CMS_ANA_OVRDVAL_9_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 23, 0, 0x260000);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56G__CMS_ANA_OVRDEN_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G__CMS_ANA_OVRDEN_1 *)&rdata)->ovrd_en_ana_lcpll_lf_test_in_i = 0x1;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_TXS_CFG_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_TXS_TXS_CFG_1_ADAPTATION_WAIT_CNT_X256, 0xf);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_WKUP_CNT_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_TXS_WKUP_CNTLDO_WKUP_CNT_X32, 0xff);
+ set_fields_e56(&rdata, E56PHY_TXS_WKUP_CNTDCC_WKUP_CNT_X32, 0xff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_PIN_OVRDVAL_6_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 19, 16, 0x6);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_PIN_OVRDEN_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_TXS_PIN_OVRDEN_0_OVRD_EN_TX0_EFUSE_BITS_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_ANA_OVRDVAL_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_TXS_ANA_OVRDVAL_1_ANA_TEST_DAC_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_TXS_ANA_OVRDEN_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_TXS_ANA_OVRDEN_0_OVRD_EN_ANA_TEST_DAC_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ txgbe_e56_tx_ffe_cfg(hw, TXGBE_LINK_SPEED_10GB_FULL);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_RXS_CFG_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_RXS_CFG_0_DSER_DATA_SEL, 0x0);
+ set_fields_e56(&rdata, E56PHY_RXS_RXS_CFG_0_TRAIN_CLK_GATE_BYPASS_EN, 0x1fff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OSC_CAL_N_CDR_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G_RXS0_OSC_CAL_N_CDR_0 *)&rdata)->prediv0 = 0xfa0;
+ ((E56G_RXS0_OSC_CAL_N_CDR_0 *)&rdata)->target_cnt0 = 0x203a;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OSC_CAL_N_CDR_4_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G_RXS0_OSC_CAL_N_CDR_4 *)&rdata)->osc_range_sel0 = 0x2;
+ ((E56G_RXS0_OSC_CAL_N_CDR_4 *)&rdata)->vco_code_init = 0x7ff;
+ ((E56G_RXS0_OSC_CAL_N_CDR_4 *)&rdata)->osc_current_boost_en0 = 0x1;
+ ((E56G_RXS0_OSC_CAL_N_CDR_4 *)&rdata)->bbcdr_current_boost0 = 0x0;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OSC_CAL_N_CDR_5_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_SDM_WIDTH, 0x3);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_PROP_STEP_PRELOCK,
+ 0xf);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_PROP_STEP_POSTLOCK,
+ 0xf);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_GAIN_CTRL_POSTLOCK,
+ 0xc);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_BB_CDR_GAIN_CTRL_PRELOCK,
+ 0xf);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_5_BBCDR_RDY_CNT, 0x3);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OSC_CAL_N_CDR_6_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_6_PI_GAIN_CTRL_PRELOCK, 0x7);
+ set_fields_e56(&rdata, E56PHY_RXS_OSC_CAL_N_CDR_6_PI_GAIN_CTRL_POSTLOCK, 0x5);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_INTL_CONFIG_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G_RXS0_INTL_CONFIG_0 *)&rdata)->adc_intl2slice_delay0 = 0x5555;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_INTL_CONFIG_2_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ ((E56G_RXS0_INTL_CONFIG_2 *)&rdata)->interleaver_hbw_disable0 = 0x1;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_TXFFE_TRAINING_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_0_ADC_DATA_PEAK_LTH, 0x56);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_0_ADC_DATA_PEAK_UTH, 0x6a);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_TXFFE_TRAINING_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_1_C1_LTH, 0x1e8);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_1_C1_UTH, 0x78);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_TXFFE_TRAINING_2_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_2_CM1_LTH, 0x100);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_2_CM1_UTH, 0xff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_TXFFE_TRAINING_3_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_3_CM2_LTH, 0x4);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_3_CM2_UTH, 0x37);
+ set_fields_e56(&rdata, E56PHY_RXS_TXFFE_TRAINING_3_TXFFE_TRAIN_MOD_TYPE, 0x38);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_VGA_TRAINING_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_VGA_TRAINING_0_VGA_TARGET, 0x34);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_VGA_TRAINING_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_VGA_TRAINING_1_VGA1_CODE_INIT0, 0xa);
+ set_fields_e56(&rdata, E56PHY_RXS_VGA_TRAINING_1_VGA2_CODE_INIT0, 0xa);
+ set_fields_e56(&rdata, E56PHY_RXS_VGA_TRAINING_1_VGA1_CODE_INIT123, 0xa);
+ set_fields_e56(&rdata, E56PHY_RXS_VGA_TRAINING_1_VGA2_CODE_INIT123, 0xa);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_CTLE_TRAINING_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_0_CTLE_CODE_INIT0, 0x9);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_0_CTLE_CODE_INIT123, 0x9);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_CTLE_TRAINING_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_1_LFEQ_LUT, 0x1ffffea);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_CTLE_TRAINING_2_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P1, 0x18);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P2, 0);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_2_ISI_TH_FRAC_P3, 0);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_CTLE_TRAINING_3_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P1, 1);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P2, 0);
+ set_fields_e56(&rdata, E56PHY_RXS_CTLE_TRAINING_3_TAP_WEIGHT_P3, 0);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADC_SLICE_DATA_AVG_CNT,
+ 0x3);
+ set_fields_e56(&rdata, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_ADC_DATA_AVG_CNT, 0x3);
+ set_fields_e56(&rdata, E56PHY_RXS_OFFSET_N_GAIN_CAL_0_FE_OFFSET_DAC_CLK_CNT_X8,
+ 0xc);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_OFFSET_N_GAIN_CAL_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_OFFSET_N_GAIN_CAL_1_SAMP_ADAPT_CFG, 0x5);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_FFE_TRAINING_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_FFE_TRAINING_0_FFE_TAP_EN, 0xf9ff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_IDLE_DETECT_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MAX, 0xa);
+ set_fields_e56(&rdata, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MIN, 0x5);
+ wr32_ephy(hw, addr, rdata);
+
+ addr = 0x6cc;
+ rdata = 0x8020000;
+ wr32_ephy(hw, addr, rdata);
+ addr = 0x94;
+ rdata = 0;
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDVAL_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDVAL_0_ANA_EN_RTERM_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDEN_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDEN_0_OVRD_EN_ANA_EN_RTERM_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDVAL_6_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 4, 0, 0x6);
+ set_fields_e56(&rdata, 14, 13, 0x2);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDEN_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDEN_1_OVRD_EN_ANA_BBCDR_VCOFILT_BYP_I,
+ 0x1);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDEN_1_OVRD_EN_ANA_TEST_BBCDR_I, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDVAL_15_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 2, 0, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDVAL_17_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDVAL_17_ANA_VGA2_BOOST_CSTM_I, 0x0);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDEN_3_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDEN_3_OVRD_EN_ANA_ANABS_CONFIG_I, 0x1);
+ set_fields_e56(&rdata, E56PHY_RXS_ANA_OVRDEN_3_OVRD_EN_ANA_VGA2_BOOST_CSTM_I,
+ 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDVAL_14_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 13, 13, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_ANA_OVRDEN_4_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 13, 13, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_EYE_SCAN_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_RXS_EYE_SCAN_1_EYE_SCAN_REF_TIMER, 0x400);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_RXS_RINGO_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, 21, 12, 0x366);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_PMD_CFG_3_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_PMD_CFG_3_CTRL_FSM_TIMEOUT_X64K, 0x80);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_PMD_CFG_4_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_PMD_CFG_4_TRAIN_DC_ON_PERIOD_X64K, 0x18);
+ set_fields_e56(&rdata, E56PHY_PMD_CFG_4_TRAIN_DC_PERIOD_X512K, 0x3e);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_PMD_CFG_5_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_PMD_CFG_5_USE_RECENT_MARKER_OFFSET, 0x1);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_0_CONT_ON_ADC_GAIN_CAL_ERR, 0x1);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_0_DO_RX_ADC_OFST_CAL, 0x3);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_0_RX_ERR_ACTION_EN, 0x40);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_1_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST0_WAIT_CNT_X4096, 0xff);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST1_WAIT_CNT_X4096, 0xff);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST2_WAIT_CNT_X4096, 0xff);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_1_TRAIN_ST3_WAIT_CNT_X4096, 0xff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_2_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST4_WAIT_CNT_X4096, 0x1);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST5_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST6_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_2_TRAIN_ST7_WAIT_CNT_X4096, 0x4);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_3_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST8_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST9_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST10_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_3_TRAIN_ST11_WAIT_CNT_X4096, 0x4);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_4_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST12_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST13_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST14_WAIT_CNT_X4096, 0x4);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_4_TRAIN_ST15_WAIT_CNT_X4096, 0x4);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_7_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_7_TRAIN_ST4_EN, 0x4bf);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_7_TRAIN_ST5_EN, 0xc4bf);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_8_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_8_TRAIN_ST7_EN, 0x47ff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_12_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_12_TRAIN_ST15_EN, 0x67ff);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_13_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_13_TRAIN_ST0_DONE_EN, 0x8001);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_13_TRAIN_ST1_DONE_EN, 0x8002);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_14_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_14_TRAIN_ST3_DONE_EN, 0x8008);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_15_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_15_TRAIN_ST4_DONE_EN, 0x8004);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_17_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_17_TRAIN_ST8_DONE_EN, 0x20c0);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_18_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_18_TRAIN_ST10_DONE_EN, 0x0);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_29_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_29_TRAIN_ST15_DC_EN, 0x3f6d);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_33_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_33_TRAIN0_RATE_SEL, 0x8000);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_33_TRAIN1_RATE_SEL, 0x8000);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_CTRL_FSM_CFG_34_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_34_TRAIN2_RATE_SEL, 0x8000);
+ set_fields_e56(&rdata, E56PHY_CTRL_FSM_CFG_34_TRAIN3_RATE_SEL, 0x8000);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_KRT_TFSM_CFG_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_KRT_TFSM_CFGKRT_TFSM_MAX_WAIT_TIMER_X1000K, 0x49);
+ set_fields_e56(&rdata, E56PHY_KRT_TFSM_CFGKRT_TFSM_MAX_WAIT_TIMER_X8000K, 0x37);
+ set_fields_e56(&rdata, E56PHY_KRT_TFSM_CFGKRT_TFSM_HOLDOFF_TIMER_X256K, 0x2f);
+ wr32_ephy(hw, addr, rdata);
+
+ rdata = 0x0000;
+ addr = E56PHY_FETX_FFE_TRAIN_CFG_0_ADDR;
+ rdata = rd32_ephy(hw, addr);
+ set_fields_e56(&rdata, E56PHY_FETX_FFE_TRAIN_CFG_0_KRT_FETX_INIT_FFE_CFG_2,
+ 0x2);
+ wr32_ephy(hw, addr, rdata);
+
+ return 0;
+}
+
+/*
+ * Reset the PCS and program the PCS/PMA and E56 PHY registers for one
+ * backplane link speed.
+ *
+ * hw:           device handle; hw->curbp_link_mode is set to bp_link_mode.
+ * bp_link_mode: link speed in Gb/s, one of 10, 25 or 40.
+ *
+ * Returns -EINVAL, without touching the hardware, when bp_link_mode is not
+ * one of the supported speeds. Otherwise returns the status of
+ * txgbe_e56_cms_cfg_for_temp_track_range(); the results of the two reset
+ * polls are only logged.
+ */
+static int txgbe_set_phy_link_mode(struct txgbe_hw *hw,
+		u8 bp_link_mode)
+{
+	int status = 0;
+	u32 rdata = 0;
+
+	u32 speed_select = 0;
+	u32 pcs_type_sel = 0;
+	u32 cns_en = 0;
+	u32 rsfec_en = 0;
+	u32 pma_type = 0;
+	u32 an0_rate_select = 0;
+	/* value written to the per-speed DATAPATH_CFG0/1 fields */
+	u32 dp_sel = 0;
+
+	switch (bp_link_mode) {
+	case 10:
+		speed_select = 0; /* 10 Gb/s */
+		pcs_type_sel = 0; /* 10GBASE-R PCS Type */
+		cns_en = 0; /* CNS_EN disable */
+		rsfec_en = 0; /* RS-FEC disable */
+		pma_type = 0x0b; /* 0001011b: 10GBASE-KR PMA/PMD type */
+		an0_rate_select = 2; /* 10G-KR */
+		dp_sel = 0x5;
+		break;
+	case 40:
+		speed_select = 3; /* 40 Gb/s */
+		pcs_type_sel = 4; /* 40GBASE-R PCS Type */
+		cns_en = 0; /* CNS_EN disable */
+		rsfec_en = 0; /* RS-FEC disable */
+		pma_type = 0x21; /* 0100001b: 40GBASE-CR4 PMA/PMD type */
+		an0_rate_select = 4; /* 40G-KR: 3 40G-CR: 4 */
+		dp_sel = 0x5;
+		break;
+	case 25:
+		speed_select = 5; /* 25 Gb/s */
+		pcs_type_sel = 7; /* 25GBASE-R PCS Type */
+		cns_en = 1; /* CNS_EN */
+		rsfec_en = 1; /* RS-FEC enable */
+		pma_type = 0x39; /* 0111001b: 25GBASE-KR PMA/PMD type */
+		an0_rate_select = 9; /* 9/10/17 25GK/CR-S or 25GK/CR */
+		dp_sel = 0x3;
+		break;
+	default:
+		/* do not reset or reprogram the PCS/PHY for an unknown speed */
+		BP_LOG("%s %d :Invalid bp_link_mode\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+
+	hw->curbp_link_mode = bp_link_mode;
+	/* To switch to the 40G mode Ethernet operation, complete the following steps:*/
+	/* 1. Initiate the vendor-specific software reset by programming
+	 * the VR_RST field (bit [15]) of the VR_PCS_DIG_CTRL1 register to 1.
+	 */
+	rdata = rd32_epcs(hw, 0x038000);
+	wr32_epcs(hw, 0x038000, rdata | BIT(15));
+
+	/* 2. Wait for the hardware to clear the value for the VR_RST
+	 * field (bit [15]) of the VR_PCS_DIG_CTRL1 register.
+	 * This is a PCS register, so it is polled with the PCS accessor,
+	 * matching the write that set VR_RST above.
+	 */
+	BP_LOG("Wait for the bit [15] (VR_RST) to get cleared.\n");
+	status = kr_read_poll(rd32_epcs, rdata,
+			      FIELD_GET_M(BIT(15), rdata) == 0, 100,
+			      2000, hw, 0x038000);
+	BP_LOG("Wait PHY VR_RST = %x, Wait VR_RST %s.\n",
+	       rdata, status ? "FAILED" : "SUCCESS");
+
+	/* wait rx/tx/cm powerdn_st according pmd 50 2.0.5 */
+	status = kr_read_poll(rd32_ephy, rdata,
+			      (rdata & GENMASK(3, 0)) == 0x9, 100,
+			      2000, hw, 0x14d4);
+	BP_LOG("wait ctrl_fsm_cm_st = %x, %s.\n",
+	       rdata, status ? "FAILED" : "SUCCESS");
+
+	/* 3. Write the speed selection to bits [5:2] of the SR_PCS_CTRL1
+	 * register. 10G: 0 25G: 5 40G: 3
+	 */
+	rdata = rd32_epcs(hw, 0x030000);
+	set_fields_e56(&rdata, 5, 2, speed_select);
+	wr32_epcs(hw, 0x030000, rdata);
+
+	/* 4. Write pcs mode sel to bits [3:0] of the SR_PCS_CTRL2 register.
+	 * 10G: 0 25G: 4'b0111 40G: 4'b0100
+	 */
+	rdata = rd32_epcs(hw, 0x030007);
+	set_fields_e56(&rdata, 3, 0, pcs_type_sel);
+	wr32_epcs(hw, 0x030007, rdata);
+
+	/* 0 1 1 1 0 0 1 : 25GBASE-KR or 25GBASE-KR-S PMA/PMD type
+	 * 0 1 1 1 0 0 0 : 25GBASE-CR or 25GBASE-CR-S PMA/PMD type
+	 * 0 1 0 0 0 0 1 : 40GBASE-CR4 PMA/PMD type
+	 * 0 1 0 0 0 0 0 : 40GBASE-KR4 PMA/PMD type
+	 * 0 0 0 1 0 1 1 : 10GBASE-KR PMA/PMD type
+	 */
+	rdata = rd32_epcs(hw, 0x010007);
+	set_fields_e56(&rdata, 6, 0, pma_type);
+	wr32_epcs(hw, 0x010007, rdata);
+
+	/* 5. Write only 25g en to Bits [1:0] of VR_PCS_DIG_CTRL3 register. */
+	rdata = rd32_epcs(hw, 0x38003);
+	set_fields_e56(&rdata, 1, 0, cns_en);
+	wr32_epcs(hw, 0x38003, rdata);
+
+	/* 6. Program PCS_AM_CNT field of VR_PCS_AM_CNT register to 'd16383 to
+	 * configure the alignment marker interval. To speed-up simulation,
+	 * program a smaller value to this field.
+	 */
+	if (bp_link_mode == 40)
+		wr32_epcs(hw, 0x38018, 16383);
+
+	/* 7. Program bit [2] of SR_PMA_RS_FEC_CTRL register: set for 25G
+	 * (RS-FEC is supported in 25G mode), cleared otherwise.
+	 */
+	rdata = rd32_epcs(hw, 0x100c8);
+	set_fields_e56(&rdata, 2, 2, rsfec_en);
+	wr32_epcs(hw, 0x100c8, rdata);
+
+	/* 8. To enable BASE-R FEC (if desired), set bit [0].
+	 * in SR_PMA_KR_FEC_CTRL register
+	 */
+
+	/* set phy an status to 0: bits [7:4] of 0x1434 are written with 0xe */
+	rdata = rd32_ephy(hw, 0x1434);
+	set_fields_e56(&rdata, 7, 4, 0xe);
+	wr32_ephy(hw, 0x1434, rdata);
+
+	/* 9. Program Enterprise 56G PHY regs through its own APB interface:
+	 * a. Program PHY registers as mentioned in Table 6-6 on page 1197 to
+	 *    configure the PHY to 40G
+	 *    Mode. For fast-simulation mode, additionally program,
+	 *    the registers shown in the Table 6-7 on page 1199
+	 * b. Enable the PMD by setting pmd_en field in PMD_CFG[0] (0x1400)
+	 *    register
+	 */
+	rdata = rd32_ephy(hw, ANA_OVRDVAL0);
+	set_fields_e56(&rdata, 29, 29, 0x1);
+	set_fields_e56(&rdata, 1, 1, 0x1);
+	wr32_ephy(hw, ANA_OVRDVAL0, rdata);
+
+	rdata = rd32_ephy(hw, ANA_OVRDVAL5);
+	set_fields_e56(&rdata, 24, 24, 0x1);
+	wr32_ephy(hw, ANA_OVRDVAL5, rdata);
+
+	rdata = rd32_ephy(hw, ANA_OVRDEN0);
+	set_fields_e56(&rdata, 1, 1, 0x1);
+	wr32_ephy(hw, ANA_OVRDEN0, rdata);
+
+	rdata = rd32_ephy(hw, ANA_OVRDEN1);
+	set_fields_e56(&rdata, 30, 30, 0x1);
+	set_fields_e56(&rdata, 25, 25, 0x1);
+	wr32_ephy(hw, ANA_OVRDEN1, rdata);
+
+	rdata = rd32_ephy(hw, PLL0_CFG0);
+	set_fields_e56(&rdata, 25, 24, 0x1);
+	set_fields_e56(&rdata, 17, 16, 0x3);
+	wr32_ephy(hw, PLL0_CFG0, rdata);
+
+	rdata = rd32_ephy(hw, PLL0_CFG2);
+	set_fields_e56(&rdata, 12, 8, 0x4);
+	wr32_ephy(hw, PLL0_CFG2, rdata);
+
+	rdata = rd32_ephy(hw, PLL1_CFG0);
+	set_fields_e56(&rdata, 25, 24, 0x1);
+	set_fields_e56(&rdata, 17, 16, 0x3);
+	wr32_ephy(hw, PLL1_CFG0, rdata);
+
+	rdata = rd32_ephy(hw, PLL1_CFG2);
+	set_fields_e56(&rdata, 12, 8, 0x8);
+	wr32_ephy(hw, PLL1_CFG2, rdata);
+
+	rdata = rd32_ephy(hw, PLL0_DIV_CFG0);
+	set_fields_e56(&rdata, 18, 8, 0x294);
+	set_fields_e56(&rdata, 4, 0, 0x8);
+	wr32_ephy(hw, PLL0_DIV_CFG0, rdata);
+
+	/* bits [18:16] depend on the speed: 0x5 for 10G/40G, 0x3 for 25G */
+	rdata = rd32_ephy(hw, DATAPATH_CFG0);
+	set_fields_e56(&rdata, 30, 28, 0x7);
+	set_fields_e56(&rdata, 26, 24, 0x5);
+	set_fields_e56(&rdata, 18, 16, dp_sel);
+	set_fields_e56(&rdata, 14, 12, 0x5);
+	set_fields_e56(&rdata, 10, 8, 0x5);
+	wr32_ephy(hw, DATAPATH_CFG0, rdata);
+
+	/* bits [18:16] and [2:0] depend on the speed in the same way */
+	rdata = rd32_ephy(hw, DATAPATH_CFG1);
+	set_fields_e56(&rdata, 26, 24, 0x5);
+	set_fields_e56(&rdata, 10, 8, 0x5);
+	set_fields_e56(&rdata, 18, 16, dp_sel);
+	set_fields_e56(&rdata, 2, 0, dp_sel);
+	wr32_ephy(hw, DATAPATH_CFG1, rdata);
+
+	rdata = rd32_ephy(hw, AN_CFG1);
+	set_fields_e56(&rdata, 4, 0, an0_rate_select);
+	wr32_ephy(hw, AN_CFG1, rdata);
+
+	status = txgbe_e56_cms_cfg_for_temp_track_range(hw);
+
+	/* speed specific PHY register set; its result is not checked */
+	switch (bp_link_mode) {
+	case 10:
+		txgbe_e56_bp_cfg_10g(hw);
+		break;
+	case 25:
+		txgbe_e56_bp_cfg_25g(hw);
+		break;
+	case 40:
+		txgbe_e56_cfg_40g(hw);
+		break;
+	default:
+		break;
+	}
+
+	return status;
+}
+
+/*
+ * Reset the PCS/PHY, program the auto-negotiation advertisement registers
+ * and bring the E56 PHY up in 10G mode.
+ *
+ * hw:           device handle; hw->an_done is cleared.
+ * bp_link_mode: unused.
+ * need_restart: when zero and hw->curbp_link_mode is already 10, the
+ *               function returns 0 without touching the hardware.
+ *
+ * Returns the status of the final poll on E56PHY_CTRL_FSM_RX_STAT_0_ADDR;
+ * the results of the earlier steps are only logged.
+ */
+int txgbe_e56_set_phy_link_mode(struct txgbe_hw *hw,
+		u8 bp_link_mode, u32 need_restart)
+{
+	u32 adv_speeds = 0;
+	u32 adv_fec = 0;
+	int ret = 0;
+	u32 reg;
+
+	UNREFERENCED_PARAMETER(bp_link_mode);
+
+	hw->an_done = false;
+	if (!need_restart && hw->curbp_link_mode == 10)
+		return 0;
+	BP_LOG("Setup to backplane mode ==========\n");
+
+	hw->an_done = false;
+	/* pcs + phy rst: bit 16 for LAN 1, bit 19 otherwise */
+	reg = rd32(hw, 0x1000c);
+	reg |= (hw->bus.lan_id == 1) ? BIT(16) : BIT(19);
+	wr32(hw, 0x1000c, reg);
+	msleep(20);
+
+	/* clear interrupt */
+	wr32_epcs(hw, 0x070000, 0);
+	wr32_epcs(hw, 0x030000, 0x8000);
+	reg = rd32_epcs(hw, 0x070000);
+	set_fields_e56(&reg, 12, 12, 0x1);
+	wr32_epcs(hw, 0x070000, reg);
+	wr32_epcs(hw, 0x078002, 0x0000);
+	/* pcs case fec en to work around first */
+	wr32_epcs(hw, 0x100ab, 1);
+
+	/* advertisement bits: 10GKR:7 - 25KR:14/15 - 40GKR:8 - 40GCR:9 */
+	if (txgbe_is_backplane(hw)) {
+		/* backplane: the speed comes from the low byte of the device id */
+		switch (hw->device_id & 0xFF) {
+		case 0x10:
+			adv_speeds |= BIT(7);
+			adv_fec |= TXGBE_10G_FEC_ABL;
+			break;
+		case 0x25:
+			adv_speeds |= BIT(14) | BIT(15);
+			adv_fec |= TXGBE_25G_RS_FEC_REQ |
+				   TXGBE_25G_BASE_FEC_REQ;
+			break;
+		case 0x40:
+			switch (hw->phy.bp_capa) {
+			case 0:
+				/* original configure: KR4 + CR4 */
+				adv_speeds |= BIT(9) | BIT(8);
+				break;
+			case 1:
+				/* only 40GBASE-KR4 */
+				adv_speeds |= BIT(8);
+				break;
+			case 2:
+				/* only 40GBASE-CR4 */
+				adv_speeds |= BIT(9);
+				break;
+			default:
+				break;
+			}
+			adv_fec |= TXGBE_10G_FEC_ABL;
+			BP_LOG("Advertised abilities: %d\n", adv_speeds);
+			break;
+		default:
+			break;
+		}
+	} else {
+		/* otherwise advertise every speed listed in fiber_suppport_speed */
+		u32 speeds = hw->phy.fiber_suppport_speed;
+
+		if ((speeds & TXGBE_LINK_SPEED_10GB_FULL) ==
+		    TXGBE_LINK_SPEED_10GB_FULL) {
+			adv_speeds |= 0x80;
+			adv_fec |= TXGBE_10G_FEC_ABL;
+		}
+
+		if ((speeds & TXGBE_LINK_SPEED_25GB_FULL) ==
+		    TXGBE_LINK_SPEED_25GB_FULL) {
+			adv_speeds |= 0xc000;
+			adv_fec |= TXGBE_25G_RS_FEC_REQ |
+				   TXGBE_25G_BASE_FEC_REQ;
+		}
+
+		if ((speeds & TXGBE_LINK_SPEED_40GB_FULL) ==
+		    TXGBE_LINK_SPEED_40GB_FULL) {
+			adv_speeds |= BIT(9) | BIT(8);
+			adv_fec |= TXGBE_10G_FEC_ABL;
+		}
+	}
+
+	wr32_epcs(hw, 0x070010, 0x0001);
+
+	/* 10GKR:7-25KR:14/15-40GKR:8-40GCR:9 */
+	wr32_epcs(hw, 0x070011, adv_speeds | 0x11);
+
+	/* BASE-R FEC; the value read back here is only used by the log below */
+	reg = rd32_epcs(hw, 0x70012);
+	wr32_epcs(hw, 0x70012, adv_fec);
+
+	wr32_epcs(hw, 0x070016, 0x0000);
+	wr32_epcs(hw, 0x070017, 0x0);
+	wr32_epcs(hw, 0x070018, 0x0);
+
+	/* config timer */
+	wr32_epcs(hw, 0x078004, 0x003c);
+	wr32_epcs(hw, 0x078005, CL74_KRTR_TRAINNING_TIMEOUT);
+	wr32_epcs(hw, 0x078006, 25);
+	wr32_epcs(hw, 0x078000, 0x0008 | BIT(2));
+
+	BP_LOG("1.2 Wait 10G KR phy/pcs mode init ....\n");
+	ret = txgbe_set_phy_link_mode(hw, 10);
+	BP_LOG("Wait 10g phy/pcs mode init = %x, %s.\n", reg,
+	       ret ? "FAILED" : "SUCCESS");
+
+	/* 5. CM_ENABLE */
+	reg = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&reg, 21, 20, 0x3); /* pll en */
+	set_fields_e56(&reg, 19, 12, 0x0); /* tx disable */
+	set_fields_e56(&reg, 8, 8, 0x0); /* pmd mode */
+	set_fields_e56(&reg, 1, 1, 0x1); /* pmd en */
+	wr32_ephy(hw, 0x1400, reg);
+
+	/* 6, TX_ENABLE */
+	reg = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&reg, 19, 12, 0x1); /* tx en */
+	wr32_ephy(hw, 0x1400, reg);
+
+	BP_LOG("1.3 Wait 10G PHY RXS....\n");
+	ret = txgbe_e56_rxs_osc_init_for_temp_track_range(hw, 10);
+	BP_LOG("Wait 10G PHY/RXS mode init = %x, %s.\n", reg,
+	       ret ? "FAILED" : "SUCCESS");
+
+	/* Wait an 10g fsm_rx_sts */
+	ret = kr_read_poll(rd32_ephy, reg,
+			   ((reg & 0x3f) == 0xb), 1000,
+			   200, hw,
+			   E56PHY_CTRL_FSM_RX_STAT_0_ADDR);
+	BP_LOG("Wait 10g fsm_rx_sts = %x, Wait rx_sts %s.\n", reg,
+	       ret ? "FAILED" : "SUCCESS");
+
+	/* set bit 12 of 0x070000 again, as done after the reset above */
+	reg = rd32_epcs(hw, 0x070000);
+	set_fields_e56(&reg, 12, 12, 0x1);
+	wr32_epcs(hw, 0x070000, reg);
+	BP_LOG("Setup the backplane mode========end ==\n");
+
+	return ret;
+}
+
+/*
+ * Read the local and link partner auto-negotiation ability registers, log
+ * them, and store the decoded values.
+ *
+ * hw:            device handle; hw->phy.fec_mode is rebuilt from the link
+ *                partner's third ability register.
+ * local_ability: filled from SR_AN_MMD_ADV_REG1..3.
+ * lp_ability:    filled from SR_AN_MMD_LP_ABL1..3.
+ */
+static void txgbe_e56_print_page_status(struct txgbe_hw *hw,
+		struct txgbe_backplane_ability *local_ability,
+		struct txgbe_backplane_ability *lp_ability)
+{
+	u32 fec = 0;
+	u32 reg;
+
+	/* Read the local AN73 Base Page Ability Registers */
+	BP_LOG("Read the local Base Page Ability Registers\n");
+	reg = rd32_epcs(hw, SR_AN_MMD_ADV_REG1);
+	local_ability->next_page = !!(reg & BIT(15));
+	BP_LOG("\tread 70010 data %0x\n", reg);
+
+	reg = rd32_epcs(hw, SR_AN_MMD_ADV_REG2);
+	BP_LOG("\tread 70011 data %0x\n", reg);
+	/* the link ability field is the 11 bits starting at bit 5 */
+	local_ability->link_ability = (reg >> 5) & GENMASK(10, 0);
+	/* amber-lite only support 10GKR - 25GKR/CR - 25GKR-S/CR-S */
+	BP_LOG("\t10GKR : %x\t25GKR-S/CR-S: %x\t25GKR/CR : %x\n",
+	       !!(local_ability->link_ability & BIT(ABILITY_10GBASE_KR)),
+	       !!(local_ability->link_ability & BIT(ABILITY_25GBASE_KRCR_S)),
+	       !!(local_ability->link_ability & BIT(ABILITY_25GBASE_KRCR)));
+	BP_LOG("\t40GCR4 : %x\t40GKR4 : %x\n",
+	       !!(local_ability->link_ability & BIT(ABILITY_40GBASE_CR4)),
+	       !!(local_ability->link_ability & BIT(ABILITY_40GBASE_KR4)));
+
+	reg = rd32_epcs(hw, SR_AN_MMD_ADV_REG3);
+	BP_LOG("\tF1:FEC Req\tF0:FEC Sup\tF3:25GFEC\tF2:25GRS\n");
+	BP_LOG("\tF1: %d\t\tF0: %d\t\tF3: %d\t\tF2: %d\n",
+	       ((reg >> 15) & 0x01), ((reg >> 14) & 0x01),
+	       ((reg >> 13) & 0x01), ((reg >> 12) & 0x01));
+	local_ability->fec_ability = reg;
+	BP_LOG("\tread 70012 data %0x\n", reg);
+
+	/* Read the link partner AN73 Base Page Ability Registers */
+	BP_LOG("Read the link partner Base Page Ability Registers\n");
+	reg = rd32_epcs(hw, SR_AN_MMD_LP_ABL1);
+	lp_ability->next_page = !!(reg & BIT(15));
+	BP_LOG("\tread 70013 data %0x\n", reg);
+
+	reg = rd32_epcs(hw, SR_AN_MMD_LP_ABL2);
+	lp_ability->link_ability = (reg >> 5) & GENMASK(10, 0);
+	BP_LOG("\tread 70014 data %0x\n", reg);
+	BP_LOG("\tKX : %x\tKX4 : %x\n",
+	       !!(lp_ability->link_ability & BIT(ABILITY_1000BASE_KX)),
+	       !!(lp_ability->link_ability & BIT(ABILITY_10GBASE_KX4)));
+	BP_LOG("\t10GKR : %x\t25GKR-S/CR-S: %x\t25GKR/CR : %x\n",
+	       !!(lp_ability->link_ability & BIT(ABILITY_10GBASE_KR)),
+	       !!(lp_ability->link_ability & BIT(ABILITY_25GBASE_KRCR_S)),
+	       !!(lp_ability->link_ability & BIT(ABILITY_25GBASE_KRCR)));
+	BP_LOG("\t40GCR4 : %x\t40GKR4 : %x\n",
+	       !!(lp_ability->link_ability & BIT(ABILITY_40GBASE_CR4)),
+	       !!(lp_ability->link_ability & BIT(ABILITY_40GBASE_KR4)));
+
+	reg = rd32_epcs(hw, SR_AN_MMD_LP_ABL3);
+	BP_LOG("\tF1:FEC Req\tF0:FEC Sup\tF3:25GFEC\tF2:25GRS\n");
+	BP_LOG("\tF1: %d\t\tF0: %d\t\tF3: %d\t\tF2: %d\n",
+	       ((reg >> 15) & 0x01), ((reg >> 14) & 0x01),
+	       ((reg >> 13) & 0x01), ((reg >> 12) & 0x01));
+	lp_ability->fec_ability = reg;
+
+	/* keep only the FEC flags the link partner reported */
+	if (reg & TXGBE_25G_RS_FEC_REQ)
+		fec |= TXGBE_25G_RS_FEC_REQ;
+	if (reg & TXGBE_25G_BASE_FEC_REQ)
+		fec |= TXGBE_25G_BASE_FEC_REQ;
+	if (reg & TXGBE_10G_FEC_ABL)
+		fec |= TXGBE_10G_FEC_ABL;
+	if (reg & TXGBE_10G_FEC_REQ)
+		fec |= TXGBE_10G_FEC_REQ;
+	hw->phy.fec_mode = fec;
+	BP_LOG("\tread 70015 data %0x\n", reg);
+
+	/* dump the remaining registers 0x70016..0x7001b */
+	BP_LOG("\tread 70016 data %0x\n", rd32_epcs(hw, 0x70016));
+	BP_LOG("\tread 70017 data %0x\n", rd32_epcs(hw, 0x70017));
+	BP_LOG("\tread 70018 data %0x\n", rd32_epcs(hw, 0x70018));
+	BP_LOG("\tread 70019 data %0x\n", rd32_epcs(hw, 0x70019));
+	BP_LOG("\tread 7001a data %0x\n", rd32_epcs(hw, 0x7001a));
+	BP_LOG("\tread 7001b data %0x\n", rd32_epcs(hw, 0x7001b));
+}
+
+/*
+ * Pick the backplane link mode from the abilities both link partners
+ * advertise and store it in hw->bp_link_mode (in Gb/s).
+ *
+ * The candidates are tested in this order: 40GBASE-KR4, 40GBASE-CR4,
+ * 25GBASE-KR/CR-S, 25GBASE-KR/CR, 10GBASE-KR, 10GBASE-KX4, 1000BASE-KX.
+ * 25GBASE-KR/CR-S additionally sets hw->fec_mode to TXGBE_25G_RS_FEC_REQ.
+ *
+ * Returns 0 when a mode was selected, -EINVAL otherwise. hw->bp_link_mode
+ * is set to 0 when the two ability masks share no bit at all.
+ */
+static int chk_bkp_ability(struct txgbe_hw *hw,
+		struct txgbe_backplane_ability local_ability,
+		struct txgbe_backplane_ability lp_ability)
+{
+	unsigned int shared = local_ability.link_ability &
+			      lp_ability.link_ability;
+
+	BP_LOG("CheckBkpAn73Ability():\n");
+	BP_LOG("comAbility= 0x%x, Ability= 0x%x, lpAbility= 0x%x\n",
+	       shared, local_ability.link_ability,
+	       lp_ability.link_ability);
+
+	if (shared == 0) {
+		hw->bp_link_mode = 0;
+		BP_LOG("Do not support any compatible speed mode!\n");
+		return -EINVAL;
+	}
+
+	if (shared & BIT(ABILITY_40GBASE_KR4)) {
+		BP_LOG("Link mode is [ABILITY_40GBASE_KR4].\n");
+		hw->bp_link_mode = 40;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_40GBASE_CR4)) {
+		BP_LOG("Link mode is [ABILITY_40GBASE_CR4].\n");
+		hw->bp_link_mode = 40;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_25GBASE_KRCR_S)) {
+		BP_LOG("Link mode is [ABILITY_25GBASE_KRCR_S].\n");
+		hw->fec_mode = TXGBE_25G_RS_FEC_REQ;
+		hw->bp_link_mode = 25;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_25GBASE_KRCR)) {
+		BP_LOG("Link mode is [ABILITY_25GBASE_KRCR].\n");
+		hw->bp_link_mode = 25;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_10GBASE_KR)) {
+		BP_LOG("Link mode is [ABILITY_10GBASE_KR].\n");
+		hw->bp_link_mode = 10;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_10GBASE_KX4)) {
+		BP_LOG("Link mode is [ABILITY_10GBASE_KX4].\n");
+		hw->bp_link_mode = 10;
+		return 0;
+	}
+
+	if (shared & BIT(ABILITY_1000BASE_KX)) {
+		BP_LOG("Link mode is [ABILITY_1000BASE_KX].\n");
+		hw->bp_link_mode = 1;
+		return 0;
+	}
+
+	/* common bits exist, but none of them is a mode handled above */
+	BP_LOG("No compatible link mode found!\n");
+	return -EINVAL;
+}
+
+/*
+ * Handle the auto-negotiation page exchange and select the link mode.
+ *
+ * Returns -EINVAL immediately when the page-received interrupt bit is not
+ * set in register 0x78002. Otherwise polls up to 500 times, 100 us apart,
+ * until bit 3 of the state register 0x78010 is set (hw->fsm is then set to
+ * 0x8), and finally returns the result of chk_bkp_ability() on the last
+ * abilities read. The abilities are also checked when the loop runs out.
+ */
+static int txgbe_e56_exchange_page(struct txgbe_hw *hw)
+{
+	struct txgbe_backplane_ability local_ability = {0}, lp_ability = {0};
+	u32 page_sel = 0;
+	u32 intr;
+	int poll;
+
+	intr = rd32_epcs(hw, 0x78002);
+	if (!(intr & VR_AN_INTR_PG_RCV))
+		return -EINVAL;
+
+	for (poll = 0; poll < 500; poll++) {
+		u32 fsm = rd32_epcs(hw, 0x78010);
+		u32 ack;
+
+		intr = rd32_epcs(hw, 0x78002);
+		BP_LOG("-----count----- %d - fsm: %x\n", poll, fsm);
+		BP_LOG("read 78002 data %0x and clear pacv\n", intr);
+
+		/* write the interrupt register back with bit 2 cleared */
+		ack = intr;
+		set_fields_e56(&ack, 2, 2, 0x0);
+		wr32_epcs(hw, 0x78002, ack);
+
+		if (intr & VR_AN_INTR_PG_RCV) {
+			u32 page;
+
+			txgbe_e56_print_page_status(hw, &local_ability, &lp_ability);
+			/* 0x70013 until a page with bit 14 was seen, then 0x70019 */
+			page = rd32_epcs(hw, page_sel == 0 ? 0x70013 : 0x70019);
+			if (page & BIT(14)) {
+				if (page & BIT(15)) {
+					/* always set null message */
+					wr32_epcs(hw, 0x70016, 0x2001);
+					BP_LOG("write 70016 0x%0x\n",
+					       0x2001);
+				}
+				page_sel = 1;
+			}
+		}
+
+		if ((fsm & 0x8) == 0x8) {
+			hw->fsm = 0x8;
+			break;
+		}
+		usec_delay(100);
+	}
+
+	return chk_bkp_ability(hw, local_ability, lp_ability);
+}
+
+/*
+ * Switch the PHY to the negotiated speed (hw->bp_link_mode) and run the
+ * KR training sequence on its lanes.
+ *
+ * Supported speeds are 10, 25 (one lane) and 40 (four lanes). Any other
+ * value returns -EINVAL before the hardware is touched: lane_num would
+ * stay 0 and GENMASK(lane_num - 1, 0) would get a wrapped bit index.
+ *
+ * Returns 0 on success, otherwise the OR of the RXS calibration status and
+ * the KR training poll status.
+ * NOTE(review): the results of txgbe_set_phy_link_mode(), the temperature
+ * tracking calls and txgbe_e56_rxs_adc_adapt_seq() do not reach the return
+ * value - confirm this is intended.
+ */
+static int txgbe_e56_cl72_trainning(struct txgbe_hw *hw)
+{
+	u32 bylinkmode = hw->bp_link_mode;
+	u8 bypass_ctle = hw->bypass_ctle;
+	int status = 0, temp_data = 0;
+	u32 lane_num = 0, lane_idx = 0;
+	u32 __rte_unused pmd_ctrl = 0, txffe = 0;
+	int ret = 0;
+	u32 rdata;
+
+	u8 pll_en_cfg = 0;
+	u8 pmd_mode = 0;
+
+	switch (bylinkmode) {
+	case 10:
+		lane_num = 1;
+		pll_en_cfg = 3;
+		pmd_mode = 0;
+		break;
+	case 40:
+		lane_num = 4;
+		pll_en_cfg = 0; /* pll_en_cfg : single link to 0 */
+		pmd_mode = 1; /* pmd mode : 1 - single link */
+		break;
+	case 25:
+		lane_num = 1;
+		pll_en_cfg = 3;
+		pmd_mode = 0;
+		break;
+	default:
+		/* no lane count is known for this speed, so nothing can be trained */
+		BP_LOG("%s %d :Invalid speed\n", __func__, __LINE__);
+		return -EINVAL;
+	}
+
+	BP_LOG("2.3 Wait %dG KR phy mode init ....\n", bylinkmode);
+	status = txgbe_set_phy_link_mode(hw, bylinkmode);
+
+	/* 13. set phy an status to 1 - AN_CFG[0]: 4-7 lane0-lane3 */
+	rdata = rd32_ephy(hw, 0x1434);
+	set_fields_e56(&rdata, 7, 4, GENMASK(lane_num - 1, 0));
+	wr32_ephy(hw, 0x1434, rdata);
+
+	/* 14 and 15. kr training: set BASER_PMD_CONTROL[0, 7] for lane0-4 */
+	rdata = rd32_ephy(hw, 0x1640);
+	set_fields_e56(&rdata, 7, 0, GENMASK(2 * lane_num - 1, 0));
+	wr32_ephy(hw, 0x1640, rdata);
+
+	/* 16. enable CMS and its internal PLL */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 21, 20, pll_en_cfg);
+	set_fields_e56(&rdata, 19, 12, 0); /* tx/rx off */
+	set_fields_e56(&rdata, 8, 8, pmd_mode);
+	set_fields_e56(&rdata, 1, 1, 0x1); /* pmd en */
+	wr32_ephy(hw, 0x1400, rdata);
+
+	/* 17. tx enable PMD_CFG[0] */
+	rdata = rd32_ephy(hw, 0x1400);
+	set_fields_e56(&rdata, 15, 12, GENMASK(lane_num - 1, 0)); /* tx en */
+	wr32_ephy(hw, 0x1400, rdata);
+
+	/* 18 */
+	/* 19. rxs calibration and adaptation sequence */
+	BP_LOG("2.4 Wait %dG RXS.... fsm: %x\n",
+	       bylinkmode, rd32_epcs(hw, 0x78010));
+	status = txgbe_e56_phy_rxs_calib_adapt_seq(hw, bylinkmode, bypass_ctle);
+	ret |= status;
+	/* 20 */
+	BP_LOG("2.5 Wait %dG phy calibration.... fsm: %x\n",
+	       bylinkmode, rd32_epcs(hw, 0x78010));
+	txgbe_e56_set_rxs_ufine_le_max(hw, bylinkmode);
+	status = txgbe_e56_get_temp(hw, &temp_data);
+	if (bylinkmode == 40)
+		status = txgbe_temp_track_seq_40g(hw, TXGBE_LINK_SPEED_40GB_FULL);
+	else
+		status = txgbe_e56_rxs_post_cdr_lock_temp_track_seq(hw, bylinkmode);
+	/* 21. wait until bits [lane_num:1] (limited to bits 3:1) of 0x163c are all set */
+	BP_LOG("2.6 Wait %dG phy kr training check.... fsm: %x\n",
+	       bylinkmode, rd32_epcs(hw, 0x78010));
+	status = kr_read_poll(rd32_ephy, rdata,
+			      ((rdata & 0xe) & GENMASK(lane_num, 1)) ==
+			      (0xe & GENMASK(lane_num, 1)), 100,
+			      10000, hw, 0x163c);
+	pmd_ctrl = rd32_ephy(hw, 0x1644);
+	BP_LOG("KR TRAINNING CHECK = %x, %s. pmd_ctrl:%lx-%lx-%lx-%lx\n",
+	       rdata, status ? "FAILED" : "SUCCESS",
+	       FIELD_GET_M(GENMASK(3, 0), pmd_ctrl),
+	       FIELD_GET_M(GENMASK(7, 4), pmd_ctrl),
+	       FIELD_GET_M(GENMASK(11, 8), pmd_ctrl),
+	       FIELD_GET_M(GENMASK(15, 12), pmd_ctrl));
+	ret |= status;
+	BP_LOG("before: %x-%x-%x-%x\n",
+	       rd32_ephy(hw, 0x141c), rd32_ephy(hw, 0x1420),
+	       rd32_ephy(hw, 0x1424), rd32_ephy(hw, 0x1428));
+
+	/* log register 0x828 of every trained lane (0x100 stride per lane) */
+	for (lane_idx = 0; lane_idx < lane_num; lane_idx++) {
+		txffe = rd32_ephy(hw, 0x828 + lane_idx * 0x100);
+		BP_LOG("after[%x]: %lx-%lx-%lx-%lx\n", lane_idx,
+		       FIELD_GET_M(GENMASK(6, 0), txffe),
+		       FIELD_GET_M(GENMASK(21, 16), txffe),
+		       FIELD_GET_M(GENMASK(29, 24), txffe),
+		       FIELD_GET_M(GENMASK(13, 8), txffe));
+	}
+
+	/* 22 */
+	BP_LOG("2.7 Wait %dG phy Rx adc.... fsm:%x\n",
+	       bylinkmode, rd32_epcs(hw, 0x78010));
+	status = txgbe_e56_rxs_adc_adapt_seq(hw, bypass_ctle);
+
+	return ret;
+}
+
+/*
+ * AN73 flow: exchange pages, power down the PHY lanes, program FEC from the requested
+ * mode bits, run CL72 training, enable idle interrupts. Returns page or training status.
+ */
+int handle_e56_bkp_an73_flow(struct txgbe_hw *hw)
+{
+ int status = 0;
+ u32 rdata;
+
+ BP_LOG("2.1 Wait page changed ....\n");
+ status = txgbe_e56_exchange_page(hw);
+ if (status) {
+ BP_LOG("Exchange page failed\n");
+ return status;
+ }
+
+ BP_LOG("2.2 Wait page changed ..done..\n");
+ wr32_epcs(hw, 0x100ab, 0); /* cleared here; set to BIT(0) below for the BASE-R FEC branches */
+ if (AN_TRAINNING_MODE) {
+ rdata = rd32_epcs(hw, 0x70000);
+ BP_LOG("read 0x70000 data %0x\n", rdata);
+ wr32_epcs(hw, 0x70000, 0);
+ BP_LOG("write 0x70000 0x%0x\n", 0);
+ }
+
+ rdata = rd32_epcs(hw, 0x78002);
+ BP_LOG("read 78002 data %0x and clear page int\n", rdata);
+ set_fields_e56(&rdata, 2, 2, 0x0);
+ wr32_epcs(hw, 0x78002, rdata);
+
+ rdata = rd32_ephy(hw, 0x1400); /* disable phy tx/rx lanes (bits 19:12) and the pmd (bit 1) */
+ set_fields_e56(&rdata, 19, 16, 0x0);
+ set_fields_e56(&rdata, 15, 12, 0x0);
+ set_fields_e56(&rdata, 1, 1, 0x0);
+ wr32_ephy(hw, 0x1400, rdata);
+ BP_LOG("Ephy Write A: 0x%x, D: 0x%x\n", 0x1400, rdata);
+
+ status = kr_read_poll(rd32_ephy, rdata, /* wait rx/tx/cm powerdn_st */
+ (rdata & GENMASK(3, 0)) == 0x9, 100,
+ 2000, hw, 0x14d4);
+ BP_LOG("wait ctrl_fsm_cm_st = %x, %s.\n",
+ rdata, status ? "FAILED" : "SUCCESS"); /* a timeout is only logged; the flow continues */
+
+ if (hw->phy.fec_mode & TXGBE_25G_RS_FEC_REQ) {
+ wr32_epcs(hw, 0x180a3, 0x68c1);
+ wr32_epcs(hw, 0x180a4, 0x3321);
+ wr32_epcs(hw, 0x180a5, 0x973e);
+ wr32_epcs(hw, 0x180a6, 0xccde);
+
+ wr32_epcs(hw, 0x38018, 1024);
+ rdata = rd32_epcs(hw, 0x100c8);
+ set_fields_e56(&rdata, 2, 2, 1);
+ wr32_epcs(hw, 0x100c8, rdata);
+ BP_LOG("Advertised FEC modes : %s\n", "RS-FEC");
+ hw->cur_fec_link = TXGBE_PHY_FEC_RS;
+ } else if (hw->phy.fec_mode & TXGBE_25G_BASE_FEC_REQ) {
+ wr32_epcs(hw, 0x100ab, BIT(0)); /* FEC: FC-FEC/BASE-R */
+ BP_LOG("Epcs Write A: 0x%x, D: 0x%x\n", 0x100ab, 1);
+ PMD_DRV_LOG(INFO, "Advertised FEC modes : %s", "25GBASE-R");
+ hw->cur_fec_link = TXGBE_PHY_FEC_BASER;
+ } else if (hw->fec_mode & (TXGBE_10G_FEC_REQ)) { /* NOTE(review): hw->fec_mode, not hw->phy.fec_mode as above — confirm */
+ wr32_epcs(hw, 0x100ab, BIT(0)); /* FEC: FC-FEC/BASE-R */
+ BP_LOG("Epcs Write A: 0x%x, D: 0x%x\n", 0x100ab, 1);
+ PMD_DRV_LOG(INFO, "Advertised FEC modes : %s", "BASE-R");
+ hw->cur_fec_link = TXGBE_PHY_FEC_BASER;
+ } else {
+ PMD_DRV_LOG(INFO, "Advertised FEC modes : %s", "NONE");
+ hw->cur_fec_link = TXGBE_PHY_FEC_OFF;
+ }
+
+ status = txgbe_e56_cl72_trainning(hw); /* this status is what the caller receives */
+
+ rdata = rd32_ephy(hw, E56PHY_RXS_IDLE_DETECT_1_ADDR); /* set idle-detect ADC peak max/min thresholds */
+ set_fields_e56(&rdata, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MAX, 0x28);
+ set_fields_e56(&rdata, E56PHY_RXS_IDLE_DETECT_1_IDLE_TH_ADC_PEAK_MIN, 0xa);
+ wr32_ephy(hw, E56PHY_RXS_IDLE_DETECT_1_ADDR, rdata);
+ wr32_ephy(hw, E56PHY_INTR_0_ADDR, E56PHY_INTR_0_IDLE_ENTRY1);
+ wr32_ephy(hw, E56PHY_INTR_1_ADDR, E56PHY_INTR_1_IDLE_EXIT1);
+ wr32_ephy(hw, E56PHY_INTR_0_ENABLE_ADDR, E56PHY_INTR_0_IDLE_ENTRY1);
+ wr32_ephy(hw, E56PHY_INTR_1_ENABLE_ADDR, E56PHY_INTR_1_IDLE_EXIT1);
+
+ return status;
+}
diff --git a/drivers/net/txgbe/base/txgbe_e56_bp.h b/drivers/net/txgbe/base/txgbe_e56_bp.h
index 97d5656cad..9329387334 100644
--- a/drivers/net/txgbe/base/txgbe_e56_bp.h
+++ b/drivers/net/txgbe/base/txgbe_e56_bp.h
@@ -276,4 +276,7 @@ typedef union {
#define E56PHY_CMS_ANA_OVRDVAL_10_ADDR (E56PHY_CMS_BASE_ADDR + 0xD8)
#define E56PHY_CMS_ANA_OVRDVAL_7_ANA_LCPLL_LF_LPF_SETCODE_CALIB_I 8, 4
+int txgbe_e56_set_phy_link_mode(struct txgbe_hw *hw,
+ u8 bp_link_mode, u32 need_restart);
+int handle_e56_bkp_an73_flow(struct txgbe_hw *hw);
#endif
diff --git a/drivers/net/txgbe/base/txgbe_hw.c b/drivers/net/txgbe/base/txgbe_hw.c
index 7b6937b9ca..8b7cbd592a 100644
--- a/drivers/net/txgbe/base/txgbe_hw.c
+++ b/drivers/net/txgbe/base/txgbe_hw.c
@@ -4071,6 +4071,12 @@ s32 txgbe_reset_pipeline_raptor(struct txgbe_hw *hw)
return err;
}
+/* Report whether the PHY's get_media_type() callback returns backplane media. */
+bool txgbe_is_backplane(struct txgbe_hw *hw)
+{
+	return hw->phy.get_media_type(hw) == txgbe_media_type_backplane;
+}
+
bool txgbe_gpio_ext_check(struct txgbe_hw *hw, u8 gpio_ext_mask)
{
u32 gpio_ext = rd32(hw, TXGBE_GPIOEXT);
diff --git a/drivers/net/txgbe/base/txgbe_hw.h b/drivers/net/txgbe/base/txgbe_hw.h
index bc34d639eb..b44190bc34 100644
--- a/drivers/net/txgbe/base/txgbe_hw.h
+++ b/drivers/net/txgbe/base/txgbe_hw.h
@@ -118,6 +118,6 @@ s32 txgbe_reinit_fdir_tables(struct txgbe_hw *hw);
bool txgbe_verify_lesm_fw_enabled_raptor(struct txgbe_hw *hw);
s32 txgbe_fmgr_cmd_op(struct txgbe_hw *hw, u32 cmd, u32 cmd_addr);
s32 txgbe_flash_read_dword(struct txgbe_hw *hw, u32 addr, u32 *data);
-s32 txgbe_e56_check_phy_link(struct txgbe_hw *hw, u32 *speed,
- bool *link_up);
+bool txgbe_is_backplane(struct txgbe_hw *hw);
+bool txgbe_gpio_ext_check(struct txgbe_hw *hw, u8 gpio_ext_mask);
#endif /* _TXGBE_HW_H_ */
diff --git a/drivers/net/txgbe/base/txgbe_osdep.h b/drivers/net/txgbe/base/txgbe_osdep.h
index f4282b3241..da069e94f6 100644
--- a/drivers/net/txgbe/base/txgbe_osdep.h
+++ b/drivers/net/txgbe/base/txgbe_osdep.h
@@ -162,6 +162,10 @@ static inline u64 REVERT_BIT_MASK64(u64 mask)
((mask & 0xFFFFFFFF00000000) >> 32);
}
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8) /* width of long, in bits */
+#define GENMASK(h, l) /* unsigned long mask with bits l..h (inclusive) set */ \
+ (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+
#define IOMEM
#define BIT(nr) (1UL << (nr))
diff --git a/drivers/net/txgbe/base/txgbe_phy.c b/drivers/net/txgbe/base/txgbe_phy.c
index bf7260a295..f3e3491b30 100644
--- a/drivers/net/txgbe/base/txgbe_phy.c
+++ b/drivers/net/txgbe/base/txgbe_phy.c
@@ -2503,6 +2503,27 @@ void txgbe_set_phy_temp(struct txgbe_hw *hw)
}
}
+/* Return non-zero when hw->phy.sfp_type is one of the da_cu, da_act_lmt or 40g_cu types. */
+int txgbe_is_dac_cable(struct txgbe_hw *hw)
+{
+	enum txgbe_sfp_type type = hw->phy.sfp_type;
+
+	return type == txgbe_sfp_type_da_cu_core0 ||
+	       type == txgbe_sfp_type_da_cu_core1 ||
+	       type == txgbe_sfp_type_da_act_lmt_core0 ||
+	       type == txgbe_sfp_type_da_act_lmt_core1 ||
+	       type == txgbe_qsfp_type_40g_cu_core0 ||
+	       type == txgbe_qsfp_type_40g_cu_core1;
+}
+
+/* Non-zero only for DAC or backplane media with the auto_neg devarg set. */
+int txgbe_xpcs_an_enabled(struct txgbe_hw *hw)
+{
+	bool an_media = txgbe_is_dac_cable(hw) || txgbe_is_backplane(hw);
+
+	return an_media && hw->devarg.auto_neg;
+}
+
/**
* txgbe_kr_handle - Handle the interrupt of auto-negotiation
* @hw: pointer to hardware structure
diff --git a/drivers/net/txgbe/base/txgbe_phy.h b/drivers/net/txgbe/base/txgbe_phy.h
index c02be3cc34..93a5ad18c1 100644
--- a/drivers/net/txgbe/base/txgbe_phy.h
+++ b/drivers/net/txgbe/base/txgbe_phy.h
@@ -105,6 +105,8 @@
#define VR_AN_INTR_CMPLT MS16(0, 0x1)
#define VR_AN_INTR_LINK MS16(1, 0x1)
#define VR_AN_INTR_PG_RCV MS16(2, 0x1)
+#define TXGBE_E56_AN_TXDIS MS16(3, 0x1)
+#define TXGBE_E56_AN_PG_RCV MS16(4, 0x1)
#define VR_AN_KR_MODE_CL 0x078003
#define VR_AN_KR_MODE_CL_PDET MS16(0, 0x1)
#define VR_XS_OR_PCS_MMD_DIGI_CTL1 0x038000
@@ -428,6 +430,24 @@
#define TXGBE_BP_M_NAUTO 0
#define TXGBE_BP_M_AUTO 1
+/* Poll val = op(args) up to times tries, sleep_us apart; evaluates to 0 once cond holds, else -1 */
+#define kr_read_poll(op, val, cond, sleep_us, times, args...) \
+({ \
+	unsigned long __sleep_us = (sleep_us); \
+	u32 __times = (times); \
+	u32 __i; /* prefixed so a caller's own 'i' in cond/args is not captured */ \
+	int __cond = 0; \
+	for (__i = 0; __i < __times; __i++) { \
+		(val) = op(args); \
+		if (cond) { \
+			__cond = 1; \
+			break; \
+		} \
+		usec_delay(__sleep_us); \
+	} \
+	(__cond) ? 0 : -1; \
+})
+
#ifndef CL72_KRTR_PRBS_MODE_EN
#define CL72_KRTR_PRBS_MODE_EN 0xFFFF /* open kr prbs check */
#endif
@@ -490,6 +510,8 @@ void txgbe_autoc_write(struct txgbe_hw *hw, u64 value);
void txgbe_bp_mode_set(struct txgbe_hw *hw);
void txgbe_set_phy_temp(struct txgbe_hw *hw);
void txgbe_bp_down_event(struct txgbe_hw *hw);
+int txgbe_is_dac_cable(struct txgbe_hw *hw);
+int txgbe_xpcs_an_enabled(struct txgbe_hw *hw);
s32 txgbe_kr_handle(struct txgbe_hw *hw);
#endif /* _TXGBE_PHY_H_ */
diff --git a/drivers/net/txgbe/base/txgbe_type.h b/drivers/net/txgbe/base/txgbe_type.h
index 7fb4bcc513..47629aa9e0 100644
--- a/drivers/net/txgbe/base/txgbe_type.h
+++ b/drivers/net/txgbe/base/txgbe_type.h
@@ -719,6 +719,7 @@ struct txgbe_phy_info {
u32 addr;
u32 id;
enum txgbe_sfp_type sfp_type;
+ u32 fiber_suppport_speed;
bool sfp_setup_needed;
u32 revision;
u32 media_type;
@@ -740,6 +741,7 @@ struct txgbe_phy_info {
u16 ffe_pre2;
u16 ffe_post;
u16 fec_mode;
+ u16 bp_capa;
};
#define TXGBE_DEVARG_BP_AUTO "auto_neg"
@@ -899,7 +901,28 @@ struct txgbe_hw {
u32 cur_fec_link;
int temperature;
u32 bp_link_mode;
-};
+ bool dac_sfp;
+ bool bypass_ctle;
+ u32 curbp_link_mode;
+ bool an_done;
+ u32 fsm;
+ u64 bp_event_interval;
+};
+
+typedef enum { /* NOTE(review): presumably indexes the AN technology-ability bits — confirm */
+ ABILITY_1000BASE_KX,
+ ABILITY_10GBASE_KX4,
+ ABILITY_10GBASE_KR,
+ ABILITY_40GBASE_KR4,
+ ABILITY_40GBASE_CR4,
+ ABILITY_100GBASE_CR10,
+ ABILITY_100GBASE_KP4,
+ ABILITY_100GBASE_KR4,
+ ABILITY_100GBASE_CR4,
+ ABILITY_25GBASE_KRCR_S,
+ ABILITY_25GBASE_KRCR,
+ ABILITY_MAX, /* count of the entries above, not an ability itself */
+} ability_filed_encding;
struct txgbe_backplane_ability {
u32 next_page; /* Next Page (bit0) */
diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index f432e6ce25..f1119cf6f8 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -2010,6 +2010,10 @@ txgbe_dev_start(struct rte_eth_dev *dev)
txgbe_l2_tunnel_conf(dev);
txgbe_filter_restore(dev);
+ hw->bp_event_interval = 100 * 1000;
+ if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40)
+ rte_eal_alarm_set(hw->bp_event_interval, txgbe_dev_e56_check_bp_event, dev);
+
if (tm_conf->root && !tm_conf->committed)
PMD_DRV_LOG(WARNING,
"please call hierarchy_commit() "
@@ -2054,8 +2058,10 @@ txgbe_dev_stop(struct rte_eth_dev *dev)
PMD_INIT_FUNC_TRACE();
- if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40)
+ if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40) {
+ rte_eal_alarm_cancel(txgbe_dev_e56_check_bp_event, dev);
rte_eal_alarm_cancel(txgbe_dev_setup_link_alarm_handler_aml, hw);
+ }
rte_eal_alarm_cancel(txgbe_dev_detect_sfp, dev);
rte_eal_alarm_cancel(txgbe_tx_queue_clear_error, dev);
@@ -2926,6 +2932,107 @@ txgbe_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
return NULL;
}
+/* Alarm callback: services VR_AN_INTR events while the link is down, then re-arms itself. */
+void txgbe_dev_e56_check_bp_event(void *param)
+{
+ struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+ struct txgbe_hw *hw = TXGBE_DEV_HW(dev);
+ u32 an_int1 = 0, value = 0, fsm = 0;
+ u32 __rte_unused an_int = 0;
+ int ret = 0;
+ bool need_link_update = false;
+
+ if (!hw)
+ return;
+
+ if (!(txgbe_xpcs_an_enabled(hw)))
+ return; /* NOTE(review): the alarm is not re-armed on this path — confirm intended */
+
+ if (!hw->devarg.auto_neg) /* already implied: txgbe_xpcs_an_enabled() is false when auto_neg is unset */
+ return;
+
+ if (dev->data->dev_link.link_status) /* only continue if link is down */
+ goto out;
+
+ value = rd32_epcs(hw, VR_AN_INTR);
+ an_int = value;
+ if (value & 0xF) /* any of the low four bits set: poll at the 100 * 1000 interval */
+ hw->bp_event_interval = 100 * 1000;
+
+ if (value & VR_AN_INTR_CMPLT) {
+ hw->an_done = true;
+ need_link_update = true;
+ value &= ~VR_AN_INTR_CMPLT;
+ wr32_epcs(hw, VR_AN_INTR, value);
+ }
+
+ if (value & VR_AN_INTR_LINK) {
+ value &= ~VR_AN_INTR_LINK;
+ wr32_epcs(hw, VR_AN_INTR, value);
+ }
+
+ if (value & TXGBE_E56_AN_TXDIS) {
+ value &= ~TXGBE_E56_AN_TXDIS;
+ wr32_epcs(hw, VR_AN_INTR, value);
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_set_phy_link_mode(hw, 10, hw->bypass_ctle); /* NOTE(review): 3rd param is declared as need_restart */
+ rte_spinlock_unlock(&hw->phy_lock);
+ goto an_status;
+ }
+
+ if (value & VR_AN_INTR_PG_RCV) {
+ BP_LOG("%d Enter training\n", hw->port_id);
+ ret = handle_e56_bkp_an73_flow(hw);
+ if (!AN_TRAINNING_MODE) {
+ fsm = rd32_epcs(hw, 0x78010);
+ if (fsm & 0x8) /* skips the failure reset below even when ret is non-zero */
+ goto an_status;
+ if (ret) {
+ BP_LOG("Training FAILED, do reset\n");
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_set_phy_link_mode(hw, 10, hw->bypass_ctle);
+ rte_spinlock_unlock(&hw->phy_lock);
+ } else {
+ BP_LOG("ALL SUCCEEDED\n");
+ }
+ } else {
+ if (ret) {
+ BP_LOG("Training FAILED, do reset\n");
+ rte_spinlock_lock(&hw->phy_lock);
+ txgbe_e56_set_phy_link_mode(hw, 10, hw->bypass_ctle);
+ rte_spinlock_unlock(&hw->phy_lock);
+ } else {
+ hw->an_done = true;
+ }
+ }
+ }
+
+an_status:
+ an_int1 = rd32_epcs(hw, 0x78002); /* re-read after handling; the same CMPLT bit is tested again */
+ if (an_int1 & VR_AN_INTR_CMPLT) {
+ hw->an_done = true;
+ need_link_update = true;
+ }
+
+ BP_LOG("%d RLU:%x MLU:%x INT:%x-%x CTL:%x fsm:%x pmd_cfg0:%x an_done:%d\n",
+ hw->port_id, rd32_epcs(hw, 0x30001), rd32(hw, 0x14404),
+ an_int, an_int1,
+ rd32_epcs(hw, 0x70000),
+ rd32_epcs(hw, 0x78010),
+ rd32_ephy(hw, 0x1400),
+ hw->an_done);
+
+ if (need_link_update)
+ txgbe_dev_link_update(dev, 0);
+
+ if (dev->data->dev_link.link_status)
+ hw->bp_event_interval = 2000 * 1000; /* link is up: use the longer polling interval */
+
+out:
+ if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40) /* same test as where dev_start arms it */
+ rte_eal_alarm_set(hw->bp_event_interval, txgbe_dev_e56_check_bp_event, dev);
+}
+
static void
txgbe_dev_detect_sfp(void *param)
{
diff --git a/drivers/net/txgbe/txgbe_ethdev.h b/drivers/net/txgbe/txgbe_ethdev.h
index 1ec8e096cc..309db3bfe9 100644
--- a/drivers/net/txgbe/txgbe_ethdev.h
+++ b/drivers/net/txgbe/txgbe_ethdev.h
@@ -747,5 +747,5 @@ void txgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev,
uint16_t queue, bool on);
void txgbe_config_vlan_strip_on_all_queues(struct rte_eth_dev *dev,
int mask);
-
+void txgbe_dev_e56_check_bp_event(void *param);
#endif /* _TXGBE_ETHDEV_H_ */
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 15/21] net/txgbe: fix FEC mode configuration on 25G NIC
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The 25G NIC offers off, RS, Base-R, and auto FEC modes. When
reconfiguring the PHY, the FEC mode must match on both sides;
otherwise, the link cannot come up. The current driver fails to
maintain this requirement, causing link instability.
Add proper FEC mode handling during PHY reconfiguration to
guarantee link establishment.
Fixes: fb6eb170dfa2 ("net/txgbe: add basic link configuration for Amber-Lite")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/base/txgbe_aml.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/net/txgbe/base/txgbe_aml.c b/drivers/net/txgbe/base/txgbe_aml.c
index 5d449a0bd9..ac80d85f08 100644
--- a/drivers/net/txgbe/base/txgbe_aml.c
+++ b/drivers/net/txgbe/base/txgbe_aml.c
@@ -282,6 +282,14 @@ s32 txgbe_setup_phy_link_aml(struct txgbe_hw *hw,
!(hw->fec_mode & hw->cur_fec_link)))
goto out;
+ if (speed == TXGBE_LINK_SPEED_25GB_FULL &&
+ link_speed == TXGBE_LINK_SPEED_25GB_FULL) {
+ txgbe_e56_fec_polling(hw, &link_up);
+
+ if (link_up)
+ goto out;
+ }
+
rte_spinlock_lock(&hw->phy_lock);
ret_status = txgbe_set_link_to_amlite(hw, speed);
rte_spinlock_unlock(&hw->phy_lock);
@@ -360,7 +368,10 @@ static s32 txgbe_setup_mac_link_multispeed_fiber_aml(struct txgbe_hw *hw,
/* If we already have link at this speed, just jump out */
txgbe_e56_check_phy_link(hw, &link_speed, &link_up);
- if (link_speed == TXGBE_LINK_SPEED_25GB_FULL && link_up)
+ hw->cur_fec_link = txgbe_phy_fec_get(hw);
+
+ if (link_speed == TXGBE_LINK_SPEED_25GB_FULL && link_up &&
+ hw->fec_mode & hw->cur_fec_link)
goto out;
/* Allow module to change analog characteristics (10G -> 25G) */
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 16/21] net/txgbe: fix SFP module identification
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
Some optical modules were not correctly recognized due to ambiguous
classification in the original detection flow. Rework the module
identification logic to cover all module types. Also narrow the
I2C lock scope to avoid potential race conditions during module
access.
Fixes: ab191e6d9189 ("net/txgbe: support new SFP/QSFP modules")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/base/txgbe_hw.c | 2 -
drivers/net/txgbe/base/txgbe_phy.c | 341 ++++++++++------------------
drivers/net/txgbe/base/txgbe_phy.h | 18 +-
drivers/net/txgbe/base/txgbe_type.h | 2 +
4 files changed, 134 insertions(+), 229 deletions(-)
diff --git a/drivers/net/txgbe/base/txgbe_hw.c b/drivers/net/txgbe/base/txgbe_hw.c
index 8b7cbd592a..c84656e206 100644
--- a/drivers/net/txgbe/base/txgbe_hw.c
+++ b/drivers/net/txgbe/base/txgbe_hw.c
@@ -2909,8 +2909,6 @@ s32 txgbe_init_ops_generic(struct txgbe_hw *hw)
phy->read_i2c_eeprom = txgbe_read_i2c_eeprom;
phy->write_i2c_eeprom = txgbe_write_i2c_eeprom;
phy->identify_sfp = txgbe_identify_module;
- phy->read_i2c_byte_unlocked = txgbe_read_i2c_byte_unlocked;
- phy->write_i2c_byte_unlocked = txgbe_write_i2c_byte_unlocked;
phy->check_overtemp = txgbe_check_overtemp;
phy->reset = txgbe_reset_phy;
phy->set_link_hostif = txgbe_hic_ephy_set_link;
diff --git a/drivers/net/txgbe/base/txgbe_phy.c b/drivers/net/txgbe/base/txgbe_phy.c
index f3e3491b30..2fbe50e242 100644
--- a/drivers/net/txgbe/base/txgbe_phy.c
+++ b/drivers/net/txgbe/base/txgbe_phy.c
@@ -830,6 +830,10 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
return TXGBE_ERR_SFP_NOT_PRESENT;
}
+ err = hw->mac.acquire_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ if (err)
+ return -EBUSY;
+
err = hw->phy.read_i2c_eeprom(hw, TXGBE_SFF_IDENTIFIER,
&identifier);
if (err != 0) {
@@ -839,11 +843,13 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
hw->phy.id = 0;
hw->phy.type = txgbe_phy_unknown;
}
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return TXGBE_ERR_SFP_NOT_PRESENT;
}
if (identifier != TXGBE_SFF_IDENTIFIER_SFP) {
hw->phy.type = txgbe_phy_sfp_unsupported;
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return TXGBE_ERR_SFP_NOT_SUPPORTED;
}
@@ -888,7 +894,42 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
* 11 SFP_1g_sx_CORE0 - chip-specific
* 12 SFP_1g_sx_CORE1 - chip-specific
*/
- if (cable_tech & TXGBE_SFF_CABLE_DA_ACTIVE) {
+ if (cable_tech & TXGBE_SFF_CABLE_DA_PASSIVE) {
+ if (hw->bus.lan_id == 0)
+ hw->phy.sfp_type = txgbe_sfp_type_da_cu_core0;
+ else
+ hw->phy.sfp_type = txgbe_sfp_type_da_cu_core1;
+
+ if (hw->phy.sfp_type == txgbe_sfp_type_da_cu_core0 ||
+ hw->phy.sfp_type == txgbe_sfp_type_da_cu_core1) {
+ hw->dac_sfp = true;
+ }
+
+ if (comp_copper_len == TXGBE_SFF_COPPER_1M)
+ hw->bypass_ctle = true;
+ else
+ hw->bypass_ctle = false;
+
+ if (comp_codes_25g == TXGBE_SFF_25GBASECR_91FEC ||
+ comp_codes_25g == TXGBE_SFF_25GBASECR_74FEC ||
+ comp_codes_25g == TXGBE_SFF_25GBASECR_NOFEC) {
+ hw->phy.fiber_suppport_speed =
+ TXGBE_LINK_SPEED_25GB_FULL |
+ TXGBE_LINK_SPEED_10GB_FULL;
+ } else {
+ hw->phy.fiber_suppport_speed |=
+ TXGBE_LINK_SPEED_10GB_FULL;
+ }
+ } else if (comp_codes_25g == TXGBE_SFF_25GAUI_C2M_AOC_BER_5 ||
+ comp_codes_25g == TXGBE_SFF_25GAUI_C2M_ACC_BER_5 ||
+ comp_codes_25g == TXGBE_SFF_25GAUI_C2M_AOC_BER_12 ||
+ comp_codes_25g == TXGBE_SFF_25GAUI_C2M_ACC_BER_12) {
+ hw->dac_sfp = false;
+ hw->phy.sfp_type = (hw->bus.lan_id == 0
+ ? txgbe_sfp_type_25g_aoc_core0
+ : txgbe_sfp_type_25g_aoc_core1);
+ } else if (cable_tech & TXGBE_SFF_CABLE_DA_ACTIVE) {
+ hw->dac_sfp = false;
err = hw->phy.read_i2c_eeprom(hw,
TXGBE_SFF_CABLE_SPEC_COMP, &cable_spec);
if (err != 0)
@@ -1005,6 +1046,7 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
/* Allow any DA cable vendor */
if (cable_tech & (TXGBE_SFF_CABLE_DA_PASSIVE |
TXGBE_SFF_CABLE_DA_ACTIVE)) {
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return 0;
}
@@ -1017,6 +1059,7 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
hw->phy.sfp_type == txgbe_sfp_type_1g_sx_core0 ||
hw->phy.sfp_type == txgbe_sfp_type_1g_sx_core1)) {
hw->phy.type = txgbe_phy_sfp_unsupported;
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return TXGBE_ERR_SFP_NOT_SUPPORTED;
}
@@ -1031,9 +1074,11 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
hw->phy.sfp_type == txgbe_sfp_type_1g_sx_core1)) {
DEBUGOUT("SFP+ module not supported");
hw->phy.type = txgbe_phy_sfp_unsupported;
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return TXGBE_ERR_SFP_NOT_SUPPORTED;
}
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return err;
}
@@ -1046,28 +1091,13 @@ s32 txgbe_identify_sfp_module(struct txgbe_hw *hw)
s32 txgbe_identify_qsfp_module(struct txgbe_hw *hw)
{
s32 err = TXGBE_ERR_PHY_ADDR_INVALID;
- u32 vendor_oui = 0;
- enum txgbe_sfp_type stored_sfp_type = hw->phy.sfp_type;
- u8 identifier = 0;
- u8 comp_codes_1g = 0;
- u8 comp_codes_10g = 0;
- u8 oui_bytes[3] = {0, 0, 0};
- u16 enforce_sfp = 0;
- u8 connector = 0;
- u8 cable_length = 0;
- u8 device_tech = 0;
- bool active_cable = false;
+ u8 identifier = 0, transceiver_type = 0;
u32 value;
- if (hw->phy.media_type != txgbe_media_type_fiber_qsfp) {
- hw->phy.sfp_type = txgbe_sfp_type_not_present;
- err = TXGBE_ERR_SFP_NOT_PRESENT;
- goto out;
- }
+ /* config GPIO before read i2c */
+ wr32(hw, TXGBE_GPIODATA, TXGBE_GPIOBIT_1);
if (hw->mac.type == txgbe_mac_aml40) {
- /* config GPIO before read i2c */
- wr32(hw, TXGBE_GPIODATA, TXGBE_GPIOBIT_1);
value = rd32(hw, TXGBE_GPIOEXT);
if (value & TXGBE_SFP1_MOD_PRST_LS) {
hw->phy.sfp_type = txgbe_sfp_type_not_present;
@@ -1075,175 +1105,68 @@ s32 txgbe_identify_qsfp_module(struct txgbe_hw *hw)
}
}
- err = hw->phy.read_i2c_eeprom(hw, TXGBE_SFF_IDENTIFIER,
- &identifier);
-ERR_I2C:
- if (err != 0) {
+ if (hw->phy.media_type != txgbe_media_type_fiber_qsfp) {
hw->phy.sfp_type = txgbe_sfp_type_not_present;
- hw->phy.id = 0;
- hw->phy.type = txgbe_phy_unknown;
return TXGBE_ERR_SFP_NOT_PRESENT;
}
- if (identifier != TXGBE_SFF_IDENTIFIER_QSFP_PLUS) {
- hw->phy.type = txgbe_phy_sfp_unsupported;
- err = TXGBE_ERR_SFP_NOT_SUPPORTED;
- goto out;
- }
-
- hw->phy.id = identifier;
-
- err = hw->phy.read_i2c_eeprom(hw, TXGBE_SFF_QSFP_10GBE_COMP,
- &comp_codes_10g);
+ err = hw->mac.acquire_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
if (err != 0)
- goto ERR_I2C;
+ return -EBUSY;
- err = hw->phy.read_i2c_eeprom(hw, TXGBE_SFF_QSFP_1GBE_COMP,
- &comp_codes_1g);
+ err = hw->phy.read_i2c_sff8636(hw, 0, TXGBE_SFF_IDENTIFIER,
+ &identifier);
if (err != 0)
- goto ERR_I2C;
+ goto err_read_i2c_eeprom;
- if (comp_codes_10g & TXGBE_SFF_QSFP_DA_PASSIVE_CABLE) {
- hw->phy.type = txgbe_phy_qsfp_unknown_passive;
- if (hw->mac.type == txgbe_mac_aml40) {
+ if (identifier != TXGBE_SFF_IDENTIFIER_QSFP &&
+ identifier != TXGBE_SFF_IDENTIFIER_QSFP_PLUS) {
+ PMD_INIT_LOG(ERR, "port[%d] QSFP module not supported, identifier = 0x%x",
+ hw->bus.lan_id, identifier);
+ hw->phy.type = txgbe_phy_sfp_unsupported;
+ err = TXGBE_ERR_SFP_NOT_SUPPORTED;
+ } else {
+ err = hw->phy.read_i2c_sff8636(hw, 0,
+ TXGBE_ETHERNET_COMP_OFFSET,
+ &transceiver_type);
+ if (err != 0)
+ goto err_read_i2c_eeprom;
+
+ if (transceiver_type & TXGBE_SFF_ETHERNET_40G_CR4) {
if (hw->bus.lan_id == 0)
hw->phy.sfp_type = txgbe_qsfp_type_40g_cu_core0;
else
hw->phy.sfp_type = txgbe_qsfp_type_40g_cu_core1;
- } else {
- if (hw->bus.lan_id == 0)
- hw->phy.sfp_type = txgbe_sfp_type_da_cu_core0;
- else
- hw->phy.sfp_type = txgbe_sfp_type_da_cu_core1;
- }
- } else if (comp_codes_10g & TXGBE_SFF_40GBASE_SR4) {
- if (hw->bus.lan_id == 0)
- hw->phy.sfp_type = txgbe_qsfp_type_40g_sr_core0;
- else
- hw->phy.sfp_type = txgbe_qsfp_type_40g_sr_core1;
- } else if (comp_codes_10g & TXGBE_SFF_40GBASE_LR4) {
- if (hw->bus.lan_id == 0)
- hw->phy.sfp_type = txgbe_qsfp_type_40g_lr_core0;
- else
- hw->phy.sfp_type = txgbe_qsfp_type_40g_lr_core1;
- } else if (comp_codes_10g & (TXGBE_SFF_10GBASESR_CAPABLE |
- TXGBE_SFF_10GBASELR_CAPABLE)) {
- if (hw->bus.lan_id == 0)
- hw->phy.sfp_type = txgbe_sfp_type_srlr_core0;
- else
- hw->phy.sfp_type = txgbe_sfp_type_srlr_core1;
- } else {
- if (comp_codes_10g & TXGBE_SFF_QSFP_DA_ACTIVE_CABLE)
- active_cable = true;
-
- if (!active_cable) {
- hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_CONNECTOR,
- &connector);
-
- hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_CABLE_LENGTH,
- &cable_length);
-
- hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_DEVICE_TECH,
- &device_tech);
-
- if (connector ==
- TXGBE_SFF_QSFP_CONNECTOR_NOT_SEPARABLE &&
- cable_length > 0 &&
- ((device_tech >> 4) ==
- TXGBE_SFF_QSFP_TRANSMITTER_850NM_VCSEL))
- active_cable = true;
+ hw->phy.fiber_suppport_speed =
+ TXGBE_LINK_SPEED_40GB_FULL |
+ TXGBE_LINK_SPEED_10GB_FULL;
}
- if (active_cable) {
- hw->phy.type = txgbe_phy_qsfp_unknown_active;
+ if (transceiver_type & TXGBE_SFF_ETHERNET_40G_SR4) {
if (hw->bus.lan_id == 0)
- hw->phy.sfp_type =
- txgbe_sfp_type_da_act_lmt_core0;
+ hw->phy.sfp_type = txgbe_qsfp_type_40g_sr_core0;
else
- hw->phy.sfp_type =
- txgbe_sfp_type_da_act_lmt_core1;
- } else {
- /* unsupported module type */
- hw->phy.type = txgbe_phy_sfp_unsupported;
- err = TXGBE_ERR_SFP_NOT_SUPPORTED;
- goto out;
+ hw->phy.sfp_type = txgbe_qsfp_type_40g_sr_core1;
}
- }
-
- if (hw->phy.sfp_type != stored_sfp_type)
- hw->phy.sfp_setup_needed = true;
-
- /* Determine if the QSFP+ PHY is dual speed or not. */
- hw->phy.multispeed_fiber = false;
- if (((comp_codes_1g & TXGBE_SFF_1GBASESX_CAPABLE) &&
- (comp_codes_10g & TXGBE_SFF_10GBASESR_CAPABLE)) ||
- ((comp_codes_1g & TXGBE_SFF_1GBASELX_CAPABLE) &&
- (comp_codes_10g & TXGBE_SFF_10GBASELR_CAPABLE)))
- hw->phy.multispeed_fiber = true;
-
- /* Determine PHY vendor for optical modules */
- if (comp_codes_10g & (TXGBE_SFF_10GBASESR_CAPABLE |
- TXGBE_SFF_10GBASELR_CAPABLE)) {
- err = hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_VENDOR_OUI_BYTE0,
- &oui_bytes[0]);
-
- if (err != 0)
- goto ERR_I2C;
-
- err = hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_VENDOR_OUI_BYTE1,
- &oui_bytes[1]);
-
- if (err != 0)
- goto ERR_I2C;
- err = hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_QSFP_VENDOR_OUI_BYTE2,
- &oui_bytes[2]);
-
- if (err != 0)
- goto ERR_I2C;
-
- vendor_oui =
- ((oui_bytes[0] << 24) |
- (oui_bytes[1] << 16) |
- (oui_bytes[2] << 8));
-
- if (vendor_oui == TXGBE_SFF_VENDOR_OUI_INTEL)
- hw->phy.type = txgbe_phy_qsfp_intel;
- else
- hw->phy.type = txgbe_phy_qsfp_unknown;
-
- hw->mac.get_device_caps(hw, &enforce_sfp);
- if (!(enforce_sfp & TXGBE_DEVICE_CAPS_ALLOW_ANY_SFP)) {
- /* Make sure we're a supported PHY type */
- if (hw->phy.type == txgbe_phy_qsfp_intel) {
- err = 0;
- } else {
- if (hw->allow_unsupported_sfp) {
- DEBUGOUT("WARNING: Wangxun (R) Network Connections are quality tested using Wangxun (R) Ethernet Optics. "
- "Using untested modules is not supported and may cause unstable operation or damage to the module or the adapter. "
- "Wangxun Corporation is not responsible for any harm caused by using untested modules.");
- err = 0;
- } else {
- DEBUGOUT("QSFP module not supported");
- hw->phy.type =
- txgbe_phy_sfp_unsupported;
- err = TXGBE_ERR_SFP_NOT_SUPPORTED;
- }
- }
- } else {
- err = 0;
+ if (transceiver_type & TXGBE_SFF_ETHERNET_40G_LR4) {
+ if (hw->bus.lan_id == 0)
+ hw->phy.sfp_type = txgbe_qsfp_type_40g_lr_core0;
+ else
+ hw->phy.sfp_type = txgbe_qsfp_type_40g_lr_core1;
}
}
-out:
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return err;
+
+err_read_i2c_eeprom:
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ hw->phy.sfp_type = txgbe_sfp_type_not_present;
+ hw->phy.id = 0;
+ hw->phy.type = txgbe_phy_unknown;
+ return TXGBE_ERR_SFP_NOT_PRESENT;
}
/**
@@ -1278,6 +1201,28 @@ s32 txgbe_read_i2c_sff8472(struct txgbe_hw *hw, u8 byte_offset,
sff8472_data);
}
+/**
+ * txgbe_read_i2c_sff8636 - Reads 8 bit word of SFF-8636 data over I2C interface
+ * @hw: pointer to hardware structure
+ * @page: value written to TXGBE_SFF_QSFP_PAGE_SELECT before the read
+ * @byte_offset: byte offset to read at TXGBE_I2C_EEPROM_DEV_ADDR
+ * @sff8636_data: value read
+ * Returns the page-select write error if that fails, else the byte read status
+ **/
+s32 txgbe_read_i2c_sff8636(struct txgbe_hw *hw, u8 page, u8 byte_offset,
+ u8 *sff8636_data)
+{
+ s32 err = hw->phy.write_i2c_byte(hw, TXGBE_SFF_QSFP_PAGE_SELECT,
+ TXGBE_I2C_EEPROM_DEV_ADDR,
+ page);
+ if (err != 0)
+ return err;
+
+ return hw->phy.read_i2c_byte(hw, byte_offset,
+ TXGBE_I2C_EEPROM_DEV_ADDR,
+ sff8636_data);
+}
+
/**
* txgbe_write_i2c_eeprom - Writes 8 bit EEPROM word over I2C interface
* @hw: pointer to hardware structure
@@ -1295,7 +1240,7 @@ s32 txgbe_write_i2c_eeprom(struct txgbe_hw *hw, u8 byte_offset,
}
/**
- * txgbe_read_i2c_byte_unlocked - Reads 8 bit word over I2C
+ * txgbe_read_i2c_byte - Reads 8 bit word over I2C
* @hw: pointer to hardware structure
* @byte_offset: byte offset to read
* @dev_addr: address to read from
@@ -1304,7 +1249,7 @@ s32 txgbe_write_i2c_eeprom(struct txgbe_hw *hw, u8 byte_offset,
* Performs byte read operation to SFP module's EEPROM over I2C interface at
* a specified device address.
**/
-s32 txgbe_read_i2c_byte_unlocked(struct txgbe_hw *hw, u8 byte_offset,
+s32 txgbe_read_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
u8 dev_addr, u8 *data)
{
txgbe_i2c_start(hw, dev_addr);
@@ -1334,30 +1279,7 @@ s32 txgbe_read_i2c_byte_unlocked(struct txgbe_hw *hw, u8 byte_offset,
}
/**
- * txgbe_read_i2c_byte - Reads 8 bit word over I2C
- * @hw: pointer to hardware structure
- * @byte_offset: byte offset to read
- * @dev_addr: address to read from
- * @data: value read
- *
- * Performs byte read operation to SFP module's EEPROM over I2C interface at
- * a specified device address.
- **/
-s32 txgbe_read_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
- u8 dev_addr, u8 *data)
-{
- u32 swfw_mask = hw->phy.phy_semaphore_mask;
- int err = 0;
-
- if (hw->mac.acquire_swfw_sync(hw, swfw_mask))
- return TXGBE_ERR_SWFW_SYNC;
- err = txgbe_read_i2c_byte_unlocked(hw, byte_offset, dev_addr, data);
- hw->mac.release_swfw_sync(hw, swfw_mask);
- return err;
-}
-
-/**
- * txgbe_write_i2c_byte_unlocked - Writes 8 bit word over I2C
+ * txgbe_write_i2c_byte - Writes 8 bit word over I2C
* @hw: pointer to hardware structure
* @byte_offset: byte offset to write
* @dev_addr: address to write to
@@ -1366,54 +1288,29 @@ s32 txgbe_read_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
* Performs byte write operation to SFP module's EEPROM over I2C interface at
* a specified device address.
**/
-s32 txgbe_write_i2c_byte_unlocked(struct txgbe_hw *hw, u8 byte_offset,
- u8 dev_addr, u8 data)
+s32 txgbe_write_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
+ u8 dev_addr, u8 data)
{
txgbe_i2c_start(hw, dev_addr);
/* wait tx empty */
if (!po32m(hw, TXGBE_I2CICR, TXGBE_I2CICR_TXEMPTY,
- TXGBE_I2CICR_TXEMPTY, NULL, 100, 100)) {
+ TXGBE_I2CICR_TXEMPTY, NULL, 100, 100))
return -TERR_TIMEOUT;
- }
- wr32(hw, TXGBE_I2CDATA, byte_offset | TXGBE_I2CDATA_STOP);
+ wr32(hw, TXGBE_I2CDATA, byte_offset);
wr32(hw, TXGBE_I2CDATA, data | TXGBE_I2CDATA_WRITE);
/* wait for write complete */
if (!po32m(hw, TXGBE_I2CICR, TXGBE_I2CICR_RXFULL,
- TXGBE_I2CICR_RXFULL, NULL, 100, 100)) {
+ TXGBE_I2CICR_RXFULL, NULL, 100, 100))
return -TERR_TIMEOUT;
- }
+
txgbe_i2c_stop(hw);
return 0;
}
-/**
- * txgbe_write_i2c_byte - Writes 8 bit word over I2C
- * @hw: pointer to hardware structure
- * @byte_offset: byte offset to write
- * @dev_addr: address to write to
- * @data: value to write
- *
- * Performs byte write operation to SFP module's EEPROM over I2C interface at
- * a specified device address.
- **/
-s32 txgbe_write_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
- u8 dev_addr, u8 data)
-{
- u32 swfw_mask = hw->phy.phy_semaphore_mask;
- int err = 0;
-
- if (hw->mac.acquire_swfw_sync(hw, swfw_mask))
- return TXGBE_ERR_SWFW_SYNC;
- err = txgbe_write_i2c_byte_unlocked(hw, byte_offset, dev_addr, data);
- hw->mac.release_swfw_sync(hw, swfw_mask);
-
- return err;
-}
-
/**
* txgbe_i2c_start - Sets I2C start condition
* @hw: pointer to hardware structure
diff --git a/drivers/net/txgbe/base/txgbe_phy.h b/drivers/net/txgbe/base/txgbe_phy.h
index 93a5ad18c1..20c80d9d88 100644
--- a/drivers/net/txgbe/base/txgbe_phy.h
+++ b/drivers/net/txgbe/base/txgbe_phy.h
@@ -261,7 +261,9 @@
#define TXGBE_SFF_SFF_8472_COMP 0x5E
#define TXGBE_SFF_SFF_8472_OSCB 0x6E
#define TXGBE_SFF_SFF_8472_ESCB 0x76
+#define TXGBE_SFF_QSFP_PAGE_SELECT 0x7F
+#define TXGBE_SFF_IDENTIFIER_QSFP 0x0C
#define TXGBE_SFF_IDENTIFIER_QSFP_PLUS 0x0D
#define TXGBE_SFF_QSFP_VENDOR_OUI_BYTE0 0xA5
#define TXGBE_SFF_QSFP_VENDOR_OUI_BYTE1 0xA6
@@ -289,6 +291,9 @@
#define TXGBE_SFF_4x10GBASESR_CAP 0x11
#define TXGBE_SFF_40GBASEPSM4_PARALLEL 0x12
#define TXGBE_SFF_40GBASE_SWMD4_CAP 0x1f
+#define TXGBE_SFF_COPPER_5M 0x5
+#define TXGBE_SFF_COPPER_3M 0x3
+#define TXGBE_SFF_COPPER_1M 0x1
#define TXGBE_SFF_DA_SPEC_ACTIVE_LIMITING 0x4
#define TXGBE_SFF_25GAUI_C2M_AOC_BER_5 0x1
@@ -296,6 +301,11 @@
#define TXGBE_SFF_25GAUI_C2M_AOC_BER_12 0x18
#define TXGBE_SFF_25GAUI_C2M_ACC_BER_12 0x19
+#define TXGBE_ETHERNET_COMP_OFFSET 0x83
+#define TXGBE_SFF_ETHERNET_40G_CR4 MS(3, 0x1)
+#define TXGBE_SFF_ETHERNET_40G_SR4 MS(2, 0x1)
+#define TXGBE_SFF_ETHERNET_40G_LR4 MS(1, 0x1)
+
#define TXGBE_SFF_SOFT_RS_SELECT_MASK 0x8
#define TXGBE_SFF_SOFT_RS_SELECT_10G 0x8
#define TXGBE_SFF_SOFT_RS_SELECT_1G 0x0
@@ -493,14 +503,12 @@ s32 txgbe_identify_qsfp_module(struct txgbe_hw *hw);
s32 txgbe_check_overtemp(struct txgbe_hw *hw);
s32 txgbe_read_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
u8 dev_addr, u8 *data);
-s32 txgbe_read_i2c_byte_unlocked(struct txgbe_hw *hw, u8 byte_offset,
- u8 dev_addr, u8 *data);
s32 txgbe_write_i2c_byte(struct txgbe_hw *hw, u8 byte_offset,
u8 dev_addr, u8 data);
-s32 txgbe_write_i2c_byte_unlocked(struct txgbe_hw *hw, u8 byte_offset,
- u8 dev_addr, u8 data);
s32 txgbe_read_i2c_sff8472(struct txgbe_hw *hw, u8 byte_offset,
- u8 *sff8472_data);
+ u8 *sff8472_data);
+s32 txgbe_read_i2c_sff8636(struct txgbe_hw *hw, u8 page, u8 byte_offset,
+ u8 *sff8636_data);
s32 txgbe_read_i2c_eeprom(struct txgbe_hw *hw, u8 byte_offset,
u8 *eeprom_data);
s32 txgbe_write_i2c_eeprom(struct txgbe_hw *hw, u8 byte_offset,
diff --git a/drivers/net/txgbe/base/txgbe_type.h b/drivers/net/txgbe/base/txgbe_type.h
index 47629aa9e0..2e2d79e0e1 100644
--- a/drivers/net/txgbe/base/txgbe_type.h
+++ b/drivers/net/txgbe/base/txgbe_type.h
@@ -702,6 +702,8 @@ struct txgbe_phy_info {
u8 dev_addr, u8 data);
s32 (*read_i2c_sff8472)(struct txgbe_hw *hw, u8 byte_offset,
u8 *sff8472_data);
+ s32 (*read_i2c_sff8636)(struct txgbe_hw *hw, u8 page, u8 byte_offset,
+ u8 *sff8636_data);
s32 (*read_i2c_eeprom)(struct txgbe_hw *hw, u8 byte_offset,
u8 *eeprom_data);
s32 (*write_i2c_eeprom)(struct txgbe_hw *hw, u8 byte_offset,
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 17/21] net/txgbe: fix get module info operation
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The original I2C access flow in the module information retrieval
process was flawed. Correct the implementation to properly fetch
module info.
Fixes: abf042d32b39 ("net/txgbe: add Amber-Lite 25G/40G NICs")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/base/txgbe_phy.h | 6 +-
drivers/net/txgbe/txgbe_ethdev.c | 118 +++++++++++++++++++++++------
2 files changed, 98 insertions(+), 26 deletions(-)
diff --git a/drivers/net/txgbe/base/txgbe_phy.h b/drivers/net/txgbe/base/txgbe_phy.h
index 20c80d9d88..31bdceb35b 100644
--- a/drivers/net/txgbe/base/txgbe_phy.h
+++ b/drivers/net/txgbe/base/txgbe_phy.h
@@ -258,10 +258,14 @@
#define TXGBE_SFF_CABLE_DA_ACTIVE 0x8
#define TXGBE_SFF_CABLE_SPEC_COMP 0x3C
#define TXGBE_SFF_SFF_8472_SWAP 0x5C
+#define TXGBE_SFF_DDM_IMPLEMENTED 0x40
#define TXGBE_SFF_SFF_8472_COMP 0x5E
#define TXGBE_SFF_SFF_8472_OSCB 0x6E
#define TXGBE_SFF_SFF_8472_ESCB 0x76
-#define TXGBE_SFF_QSFP_PAGE_SELECT 0x7F
+#define TXGBE_SFF_SFF_REVISION_ADDR 0x01
+#define TXGBE_SFF_QSFP_PAGE_SELECT 0x7F
+
+#define TXGBE_MODULE_QSFP_MAX_LEN 640
#define TXGBE_SFF_IDENTIFIER_QSFP 0x0C
#define TXGBE_SFF_IDENTIFIER_QSFP_PLUS 0x0D
diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index f1119cf6f8..c34635c50a 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -5348,41 +5348,109 @@ txgbe_get_module_info(struct rte_eth_dev *dev,
struct txgbe_hw *hw = TXGBE_DEV_HW(dev);
uint32_t status;
uint8_t sff8472_rev, addr_mode;
+ uint8_t identifier;
+ uint8_t sff8636_rev;
bool page_swap = false;
+ uint32_t value;
- /* Check whether we support SFF-8472 or not */
- status = hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_SFF_8472_COMP,
- &sff8472_rev);
- if (status != 0)
- return -EIO;
+ if (hw->mac.type == txgbe_mac_aml40) {
+ value = rd32(hw, TXGBE_GPIOEXT);
+ if (value & TXGBE_SFP1_MOD_PRST_LS) {
+ PMD_DRV_LOG(WARNING, "QSFP module not present, cannot get module info.");
+ return -EINVAL;
+ }
+ }
+
+ if (hw->mac.type == txgbe_mac_aml) {
+ value = rd32(hw, TXGBE_GPIOEXT);
+ if (value & TXGBE_SFP1_MOD_ABS_LS) {
+ PMD_DRV_LOG(WARNING, "SFP module not present, cannot get module info.");
+ return -EINVAL;
+ }
+ }
- /* addressing mode is not supported */
- status = hw->phy.read_i2c_eeprom(hw,
- TXGBE_SFF_SFF_8472_SWAP,
- &addr_mode);
+ status = hw->mac.acquire_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
if (status != 0)
- return -EIO;
+ return -EBUSY;
- if (addr_mode & TXGBE_SFF_ADDRESSING_MODE) {
- PMD_DRV_LOG(ERR,
- "Address change required to access page 0xA2, "
- "but not supported. Please report the module "
- "type to the driver maintainers.");
- page_swap = true;
+ if (hw->mac.type == txgbe_mac_aml40) {
+ status = hw->phy.read_i2c_sff8636(hw, 0,
+ TXGBE_SFF_IDENTIFIER,
+ &identifier);
+ } else {
+ status = hw->phy.read_i2c_eeprom(hw,
+ TXGBE_SFF_IDENTIFIER,
+ &identifier);
}
- if (sff8472_rev == TXGBE_SFF_SFF_8472_UNSUP || page_swap) {
- /* We have a SFP, but it does not support SFF-8472 */
- modinfo->type = RTE_ETH_MODULE_SFF_8079;
- modinfo->eeprom_len = RTE_ETH_MODULE_SFF_8079_LEN;
- } else {
- /* We have a SFP which supports a revision of SFF-8472. */
- modinfo->type = RTE_ETH_MODULE_SFF_8472;
- modinfo->eeprom_len = RTE_ETH_MODULE_SFF_8472_LEN;
+ if (status != 0)
+ goto ERROR_IO;
+
+ switch (identifier) {
+ case TXGBE_SFF_IDENTIFIER_SFP:
+ /* Check whether we support SFF-8472 or not */
+ status = hw->phy.read_i2c_eeprom(hw,
+ TXGBE_SFF_SFF_8472_COMP,
+ &sff8472_rev);
+ if (status != 0)
+ goto ERROR_IO;
+
+ /* addressing mode is not supported */
+ status = hw->phy.read_i2c_eeprom(hw,
+ TXGBE_SFF_SFF_8472_SWAP,
+ &addr_mode);
+ if (status != 0)
+ goto ERROR_IO;
+
+ if (addr_mode & TXGBE_SFF_ADDRESSING_MODE) {
+ PMD_DRV_LOG(ERR,
+ "Address change required to access page 0xA2, "
+ "but not supported. Please report the module "
+ "type to the driver maintainers.");
+ page_swap = true;
+ }
+
+ if (sff8472_rev == TXGBE_SFF_SFF_8472_UNSUP || page_swap ||
+ !(addr_mode & TXGBE_SFF_DDM_IMPLEMENTED)) {
+ /* We have a SFP, but it does not support SFF-8472 */
+ modinfo->type = RTE_ETH_MODULE_SFF_8079;
+ modinfo->eeprom_len = RTE_ETH_MODULE_SFF_8079_LEN;
+ } else {
+ /* We have a SFP which supports a revision of SFF-8472. */
+ modinfo->type = RTE_ETH_MODULE_SFF_8472;
+ modinfo->eeprom_len = RTE_ETH_MODULE_SFF_8472_LEN;
+ }
+ break;
+ case TXGBE_SFF_IDENTIFIER_QSFP:
+ case TXGBE_SFF_IDENTIFIER_QSFP_PLUS:
+ status = hw->phy.read_i2c_sff8636(hw, 0,
+ TXGBE_SFF_SFF_REVISION_ADDR,
+ &sff8636_rev);
+ if (status != 0)
+ goto ERROR_IO;
+ /* Check revision compliance */
+ if (sff8636_rev > 0x02) {
+ /* Module is SFF-8636 compliant */
+ modinfo->type = RTE_ETH_MODULE_SFF_8636;
+ modinfo->eeprom_len = TXGBE_MODULE_QSFP_MAX_LEN;
+ } else {
+ modinfo->type = RTE_ETH_MODULE_SFF_8436;
+ modinfo->eeprom_len = TXGBE_MODULE_QSFP_MAX_LEN;
+ }
+ break;
+ default:
+ PMD_DRV_LOG(ERR, "SFF Module Type not recognized, identifier=0x%x", identifier);
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ return -EINVAL;
}
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return 0;
+
+ERROR_IO:
+ PMD_DRV_LOG(ERR, "I2C IO ERROR.");
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ return -EIO;
}
static int
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 19/21] net/txgbe: fix to reset Tx write-back pointer
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The write-back pointer was not reset when the Tx queue was reset, which
leads to incorrect Tx descriptor free logic. Move the reset of the
write-back pointer into txq->ops->reset(txq).
Fixes: 8ada71d0bb7f ("net/txgbe: add Tx head write-back mode for Amber-Lite")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/txgbe_rxtx.c | 45 +++++++++++++----------
drivers/net/txgbe/txgbe_rxtx.h | 1 +
drivers/net/txgbe/txgbe_rxtx_vec_common.h | 7 ++++
3 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/drivers/net/txgbe/txgbe_rxtx.c b/drivers/net/txgbe/txgbe_rxtx.c
index 4611124b68..ed34b3a38c 100644
--- a/drivers/net/txgbe/txgbe_rxtx.c
+++ b/drivers/net/txgbe/txgbe_rxtx.c
@@ -2313,6 +2313,12 @@ txgbe_reset_tx_queue(struct txgbe_tx_queue *txq)
txq->tx_next_dd = (uint16_t)(txq->tx_free_thresh - 1);
txq->tx_tail = 0;
+ /* Zero out headwb_mem memory */
+ if (txq->headwb_mem) {
+ for (i = 0; i < txq->headwb_size; i++)
+ txq->headwb_mem[i] = 0;
+ }
+
/*
* Always allow 1 descriptor to be un-allocated to avoid
* a H/W race condition
@@ -2412,7 +2418,7 @@ txgbe_get_tx_port_offloads(struct rte_eth_dev *dev)
return tx_offload_capa;
}
-static int
+static void
txgbe_setup_headwb_resources(struct rte_eth_dev *dev,
void *tx_queue,
unsigned int socket_id)
@@ -2420,33 +2426,33 @@ txgbe_setup_headwb_resources(struct rte_eth_dev *dev,
struct txgbe_hw *hw = TXGBE_DEV_HW(dev);
const struct rte_memzone *headwb;
struct txgbe_tx_queue *txq = tx_queue;
- u8 i, headwb_size = 0;
+ u8 headwb_size = 0;
- if (hw->mac.type != txgbe_mac_aml && hw->mac.type != txgbe_mac_aml40) {
- txq->headwb_mem = NULL;
- return 0;
- }
+ if (hw->mac.type != txgbe_mac_aml && hw->mac.type != txgbe_mac_aml40)
+ goto out;
+
+ if (!hw->devarg.tx_headwb)
+ goto out;
- headwb_size = hw->devarg.tx_headwb_size;
+ headwb_size = txq->headwb_size;
headwb = rte_eth_dma_zone_reserve(dev, "tx_headwb_mem", txq->queue_id,
sizeof(u32) * headwb_size,
TXGBE_ALIGN, socket_id);
if (headwb == NULL) {
- DEBUGOUT("Fail to setup headwb resources: no mem");
- txgbe_tx_queue_release(txq);
- return -ENOMEM;
+ PMD_DRV_LOG(INFO,
+ "Failed to allocate headwb memory for Tx queue %u, change to SP mode",
+ txq->queue_id);
+ goto out;
}
txq->headwb = headwb;
txq->headwb_dma = TMZ_PADDR(headwb);
txq->headwb_mem = (uint32_t *)TMZ_VADDR(headwb);
+ return;
- /* Zero out headwb_mem memory */
- for (i = 0; i < headwb_size; i++)
- txq->headwb_mem[i] = 0;
-
- return 0;
+out:
+ txq->headwb_mem = NULL;
}
int __rte_cold
@@ -2542,6 +2548,7 @@ txgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
txq->offloads = offloads;
txq->ops = &def_txq_ops;
txq->tx_deferred_start = tx_conf->tx_deferred_start;
+ txq->headwb_size = hw->devarg.tx_headwb_size;
#ifdef RTE_LIB_SECURITY
txq->using_ipsec = !!(dev->data->dev_conf.txmode.offloads &
RTE_ETH_TX_OFFLOAD_SECURITY);
@@ -2577,8 +2584,7 @@ txgbe_dev_tx_queue_setup(struct rte_eth_dev *dev,
/* set up scalar TX function as appropriate */
txgbe_set_tx_function(dev, txq);
- if (hw->devarg.tx_headwb)
- err = txgbe_setup_headwb_resources(dev, txq, socket_id);
+ txgbe_setup_headwb_resources(dev, txq, socket_id);
txq->ops->reset(txq);
txq->desc_error = 0;
@@ -4755,15 +4761,14 @@ txgbe_dev_tx_init(struct rte_eth_dev *dev)
wr32(hw, TXGBE_TXRP(txq->reg_idx), 0);
wr32(hw, TXGBE_TXWP(txq->reg_idx), 0);
- if ((hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40) &&
- hw->devarg.tx_headwb) {
+ if (txq->headwb_mem) {
uint32_t txdctl;
wr32(hw, TXGBE_PX_TR_HEAD_ADDRL(txq->reg_idx),
(uint32_t)(txq->headwb_dma & BIT_MASK32));
wr32(hw, TXGBE_PX_TR_HEAD_ADDRH(txq->reg_idx),
(uint32_t)(txq->headwb_dma >> 32));
- if (hw->devarg.tx_headwb_size == 16)
+ if (txq->headwb_size == 16)
txdctl = TXGBE_PX_TR_CFG_HEAD_WB |
TXGBE_PX_TR_CFG_HEAD_WB_64BYTE;
else
diff --git a/drivers/net/txgbe/txgbe_rxtx.h b/drivers/net/txgbe/txgbe_rxtx.h
index 43c818cfbf..5d2e33a8d4 100644
--- a/drivers/net/txgbe/txgbe_rxtx.h
+++ b/drivers/net/txgbe/txgbe_rxtx.h
@@ -416,6 +416,7 @@ struct txgbe_tx_queue {
uint64_t desc_error;
bool resetting;
const struct rte_memzone *headwb;
+ uint16_t headwb_size;
uint64_t headwb_dma;
volatile uint32_t *headwb_mem;
};
diff --git a/drivers/net/txgbe/txgbe_rxtx_vec_common.h b/drivers/net/txgbe/txgbe_rxtx_vec_common.h
index 77d7ff785b..6e561aff30 100644
--- a/drivers/net/txgbe/txgbe_rxtx_vec_common.h
+++ b/drivers/net/txgbe/txgbe_rxtx_vec_common.h
@@ -252,6 +252,13 @@ _txgbe_reset_tx_queue_vec(struct txgbe_tx_queue *txq)
txq->tx_next_dd = (uint16_t)(txq->tx_free_thresh - 1);
txq->tx_tail = 0;
+
+ /* Zero out headwb_mem memory */
+ if (txq->headwb_mem) {
+ for (i = 0; i < txq->headwb_size; i++)
+ txq->headwb_mem[i] = 0;
+ }
+
/*
* Always allow 1 descriptor to be un-allocated to avoid
* a H/W race condition
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 18/21] net/txgbe: fix get EEPROM operation
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The original I2C access flow in the module EEPROM read process was
flawed. Correct the implementation to properly read the module EEPROM
data.
Fixes: abf042d32b39 ("net/txgbe: add Amber-Lite 25G/40G NICs")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/base/txgbe_phy.h | 1 +
drivers/net/txgbe/txgbe_ethdev.c | 81 +++++++++++++++++++++++++++---
2 files changed, 76 insertions(+), 6 deletions(-)
diff --git a/drivers/net/txgbe/base/txgbe_phy.h b/drivers/net/txgbe/base/txgbe_phy.h
index 31bdceb35b..a5df015a4d 100644
--- a/drivers/net/txgbe/base/txgbe_phy.h
+++ b/drivers/net/txgbe/base/txgbe_phy.h
@@ -245,6 +245,7 @@
/* EEPROM (dev_addr = 0xA0) */
#define TXGBE_I2C_EEPROM_DEV_ADDR 0xA0
#define TXGBE_SFF_IDENTIFIER 0x00
+#define TXGBE_SFF_8636_STATUS_OFFSET 0x02
#define TXGBE_SFF_IDENTIFIER_SFP 0x03
#define TXGBE_SFF_VENDOR_OUI_BYTE0 0x25
#define TXGBE_SFF_VENDOR_OUI_BYTE1 0x26
diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index c34635c50a..57803fe841 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -5462,23 +5462,92 @@ txgbe_get_module_eeprom(struct rte_eth_dev *dev,
uint8_t databyte = 0xFF;
uint8_t *data = info->data;
uint32_t i = 0;
+ bool is_sfp = false;
+ uint32_t value;
+ uint8_t identifier = 0;
+ uint16_t offset;
+ uint8_t page = 0;
+ bool is_flat_mem = true;
+
+ if (hw->mac.type == txgbe_mac_aml40) {
+ value = rd32(hw, TXGBE_GPIOEXT);
+ if (value & TXGBE_SFP1_MOD_PRST_LS)
+ return -EIO;
+ }
+
+ if (hw->mac.type == txgbe_mac_aml) {
+ value = rd32(hw, TXGBE_GPIOEXT);
+ if (value & TXGBE_SFP1_MOD_ABS_LS)
+ return -EIO;
+ }
if (info->length == 0)
return -EINVAL;
- for (i = info->offset; i < info->offset + info->length; i++) {
- if (i < RTE_ETH_MODULE_SFF_8079_LEN)
- status = hw->phy.read_i2c_eeprom(hw, i, &databyte);
- else
- status = hw->phy.read_i2c_sff8472(hw, i, &databyte);
+ status = hw->mac.acquire_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ if (status)
+ return -EBUSY;
+
+ status = hw->phy.read_i2c_eeprom(hw,
+ TXGBE_SFF_IDENTIFIER,
+ &identifier);
+ if (status != 0)
+ goto ERROR_IO;
+ if (identifier == TXGBE_SFF_IDENTIFIER_SFP) {
+ is_sfp = true;
+ } else {
+ uint8_t rdata = 0;
+
+ status = hw->phy.read_i2c_sff8636(hw, 0,
+ TXGBE_SFF_8636_STATUS_OFFSET,
+ &rdata);
if (status != 0)
- return -EIO;
+ goto ERROR_IO;
+ if (rdata & 0x4)
+ is_flat_mem = false;
+ }
+
+ memset(data, 0, info->length);
+
+ for (i = info->offset; i < info->offset + info->length; i++) {
+ databyte = 0;
+
+ if (is_sfp) {
+ if (i < RTE_ETH_MODULE_SFF_8079_LEN)
+ status = hw->phy.read_i2c_eeprom(hw, i,
+ &databyte);
+ else
+ status = hw->phy.read_i2c_sff8472(hw, i,
+ &databyte);
+
+ if (status != 0)
+ goto ERROR_IO;
+ } else {
+ offset = i;
+ page = 0;
+ while (offset >= RTE_ETH_MODULE_SFF_8436_LEN) {
+ offset -= RTE_ETH_MODULE_SFF_8436_LEN / 2;
+ page++;
+ }
+ if (page == 0 || is_flat_mem) {
+ status = hw->phy.read_i2c_sff8636(hw, page, offset,
+ &databyte);
+ if (status != 0)
+ goto ERROR_IO;
+ }
+ }
data[i - info->offset] = databyte;
}
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
return 0;
+
+ERROR_IO:
+ PMD_DRV_LOG(ERR, "I2C IO ERROR.");
+ hw->mac.release_swfw_sync(hw, TXGBE_MNGSEM_SWPHY);
+ return -EIO;
}
bool
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 20/21] net/txgbe: fix to enable Tx desc check
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
The security library is now enabled by default and cannot be disabled if
the driver is to be used, so the Tx descriptor check (TDM_DESC_CHK) could
never be enabled. Remove this restriction and enable the check per queue
instead, for every queue that does not use IPsec.
Fixes: 0eabdfcd4af4 ("net/txgbe: enable Tx descriptor error interrupt")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/txgbe_rxtx.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/net/txgbe/txgbe_rxtx.c b/drivers/net/txgbe/txgbe_rxtx.c
index ed34b3a38c..8b277d3062 100644
--- a/drivers/net/txgbe/txgbe_rxtx.c
+++ b/drivers/net/txgbe/txgbe_rxtx.c
@@ -4761,6 +4761,12 @@ txgbe_dev_tx_init(struct rte_eth_dev *dev)
wr32(hw, TXGBE_TXRP(txq->reg_idx), 0);
wr32(hw, TXGBE_TXWP(txq->reg_idx), 0);
+#ifdef RTE_LIB_SECURITY
+ if (!txq->using_ipsec)
+#endif
+ wr32m(hw, TXGBE_TDM_DESC_CHK(txq->reg_idx / 32),
+ RTE_BIT32(txq->reg_idx % 32), RTE_BIT32(txq->reg_idx % 32));
+
if (txq->headwb_mem) {
uint32_t txdctl;
@@ -4778,11 +4784,6 @@ txgbe_dev_tx_init(struct rte_eth_dev *dev)
}
}
-#ifndef RTE_LIB_SECURITY
- for (i = 0; i < 4; i++)
- wr32(hw, TXGBE_TDM_DESC_CHK(i), 0xFFFFFFFF);
-#endif
-
/* Device configured with multiple TX queues. */
txgbe_dev_mq_tx_configure(dev);
}
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH v6 21/21] net/txgbe: fix temperature track for AML NIC
From: Zaiyu Wang @ 2026-06-16 12:20 UTC (permalink / raw)
To: dev; +Cc: Zaiyu Wang, stable, Jiawen Wu
In-Reply-To: <20260616122030.9688-1-zaiyuwang@trustnetic.com>
Previously, temperature tracking for the amlite NIC was handled by
firmware together with the hardware setup. However, the firmware-based
PHY configuration has proven to be unstable.
Re-add the temperature tracking function directly in the driver and
invoke it periodically to ensure the PHY remains calibrated. According
to the hardware recommendation, the tracking sequence should be run at
least every 100 ms to keep temperature drift within 5 °C. Considering
the software and hardware overhead, a 2-second interval is used as a
practical trade-off that still meets stability requirements while
minimizing performance impact.
The periodic tracking is implemented using a timer in the driver, and
the sequence itself is the same as the one originally performed during
link setup.
Fixes: fb6eb170dfa2 ("net/txgbe: add basic link configuration for Amber-Lite")
Cc: stable@dpdk.org
Signed-off-by: Zaiyu Wang <zaiyuwang@trustnetic.com>
---
drivers/net/txgbe/txgbe_ethdev.c | 44 +++++++++++++++++++++++++++++++-
drivers/net/txgbe/txgbe_ethdev.h | 1 +
2 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/drivers/net/txgbe/txgbe_ethdev.c b/drivers/net/txgbe/txgbe_ethdev.c
index 57803fe841..cb69fcd28f 100644
--- a/drivers/net/txgbe/txgbe_ethdev.c
+++ b/drivers/net/txgbe/txgbe_ethdev.c
@@ -2011,8 +2011,10 @@ txgbe_dev_start(struct rte_eth_dev *dev)
txgbe_filter_restore(dev);
hw->bp_event_interval = 100 * 1000;
- if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40)
+ if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40) {
rte_eal_alarm_set(hw->bp_event_interval, txgbe_dev_e56_check_bp_event, dev);
+ rte_eal_alarm_set(1000 * 1000 * 2, txgbe_dev_check_aml_temp_event, dev);
+ }
if (tm_conf->root && !tm_conf->committed)
PMD_DRV_LOG(WARNING,
@@ -2060,6 +2062,7 @@ txgbe_dev_stop(struct rte_eth_dev *dev)
if (hw->mac.type == txgbe_mac_aml || hw->mac.type == txgbe_mac_aml40) {
rte_eal_alarm_cancel(txgbe_dev_e56_check_bp_event, dev);
+ rte_eal_alarm_cancel(txgbe_dev_check_aml_temp_event, dev);
rte_eal_alarm_cancel(txgbe_dev_setup_link_alarm_handler_aml, hw);
}
@@ -2932,6 +2935,45 @@ txgbe_dev_supported_ptypes_get(struct rte_eth_dev *dev, size_t *no_of_elements)
return NULL;
}
+void txgbe_dev_check_aml_temp_event(void *param)
+{
+ struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
+ struct txgbe_hw *hw = TXGBE_DEV_HW(dev);
+ uint32_t link_speed = 0, val = 0;
+ s32 status = 0;
+ int temp;
+
+ if (hw == NULL)
+ return;
+
+ status = txgbe_e56_get_temp(hw, &temp);
+ if (status)
+ temp = DEFAULT_TEMP;
+
+ if (!(temp - hw->temperature > 4 ||
+ hw->temperature - temp > 4))
+ goto out;
+
+ hw->temperature = temp;
+ val = rd32(hw, TXGBE_PORT);
+ if (val & TXGBE_AMLITE_LED_LINK_40G)
+ link_speed = TXGBE_LINK_SPEED_40GB_FULL;
+ else if (val & TXGBE_AMLITE_LED_LINK_25G)
+ link_speed = TXGBE_LINK_SPEED_25GB_FULL;
+ else
+ link_speed = TXGBE_LINK_SPEED_10GB_FULL;
+
+ rte_spinlock_lock(&hw->phy_lock);
+ if (hw->mac.type == txgbe_mac_aml)
+ txgbe_temp_track_seq(hw, link_speed);
+ else if (hw->mac.type == txgbe_mac_aml40)
+ txgbe_temp_track_seq_40g(hw, link_speed);
+ rte_spinlock_unlock(&hw->phy_lock);
+
+out:
+ rte_eal_alarm_set(1000 * 1000 * 2, txgbe_dev_check_aml_temp_event, dev);
+}
+
void txgbe_dev_e56_check_bp_event(void *param)
{
struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
diff --git a/drivers/net/txgbe/txgbe_ethdev.h b/drivers/net/txgbe/txgbe_ethdev.h
index 309db3bfe9..c32c61d8bf 100644
--- a/drivers/net/txgbe/txgbe_ethdev.h
+++ b/drivers/net/txgbe/txgbe_ethdev.h
@@ -747,5 +747,6 @@ void txgbe_vlan_hw_strip_bitmap_set(struct rte_eth_dev *dev,
uint16_t queue, bool on);
void txgbe_config_vlan_strip_on_all_queues(struct rte_eth_dev *dev,
int mask);
+void txgbe_dev_check_aml_temp_event(void *param);
void txgbe_dev_e56_check_bp_event(void *param);
#endif /* _TXGBE_ETHDEV_H_ */
--
2.21.0.windows.1
^ permalink raw reply related
* [PATCH] drivers: update relaxed ordering policy for mlx5 mkeys
From: Maayan Kashani @ 2026-06-16 12:23 UTC (permalink / raw)
To: dev
Cc: mkashani, rasland, Viacheslav Ovsiienko, Dariusz Sosnowski,
Bing Zhao, Ori Kam, Suanming Mou, Matan Azrad
New adapters expose additional ordering capabilities.
Query the new caps and apply them when creating DevX mkeys via
mlx5_devx_mkey_attr_set_ordering(), which sets PCI relaxed ordering
and RAW=RO when relaxed order is supported.
Use this helper on Windows (still gated by the Haswell/Broadwell check),
and on Linux for wrapped mkeys and crypto/regex/vdpa indirect mkeys when
the relaxed-ordering-only capability
(mkc_order_write_after_write_ro_only) is set.
Linux wrapped mkeys continue to use the legacy Haswell/Broadwell rule for
IBV_ACCESS_RELAXED_ORDERING on the verbs MR.
Upcoming FW will require the correct ordering attributes to be set;
otherwise, it fails to create the memory key.
Signed-off-by: Maayan Kashani <mkashani@nvidia.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@nvidia.com>
---
drivers/common/mlx5/linux/mlx5_common_os.c | 6 ++++
drivers/common/mlx5/mlx5_devx_cmds.c | 31 ++++++++++++++++++++
drivers/common/mlx5/mlx5_devx_cmds.h | 9 ++++++
drivers/common/mlx5/mlx5_prm.h | 18 ++++++++++--
drivers/common/mlx5/windows/mlx5_common_os.c | 8 ++---
drivers/crypto/mlx5/mlx5_crypto.c | 4 +++
drivers/regex/mlx5/mlx5_regex_fastpath.c | 5 ++++
drivers/regex/mlx5/mlx5_rxp.c | 4 +++
drivers/vdpa/mlx5/mlx5_vdpa_mem.c | 4 +++
9 files changed, 81 insertions(+), 8 deletions(-)
diff --git a/drivers/common/mlx5/linux/mlx5_common_os.c b/drivers/common/mlx5/linux/mlx5_common_os.c
index e3db6c41245..153709390d9 100644
--- a/drivers/common/mlx5/linux/mlx5_common_os.c
+++ b/drivers/common/mlx5/linux/mlx5_common_os.c
@@ -997,6 +997,7 @@ int
mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
size_t length, struct mlx5_pmd_wrapped_mr *pmd_mr)
{
+ struct mlx5_hca_attr hca_attr = { 0 };
struct mlx5_klm klm = {
.byte_count = length,
.address = (uintptr_t)addr,
@@ -1019,6 +1020,11 @@ mlx5_os_wrapped_mkey_create(void *ctx, void *pd, uint32_t pdn, void *addr,
klm.mkey = ibv_mr->lkey;
mkey_attr.addr = (uintptr_t)addr;
mkey_attr.size = length;
+ if (mlx5_devx_cmd_query_hca_attr(ctx, &hca_attr))
+ return -1;
+ /* If only relaxed order is allowed. */
+ if (hca_attr.mkc_order_write_after_write_ro_only)
+ mlx5_devx_mkey_attr_set_ordering(&mkey_attr, &hca_attr);
mkey = mlx5_devx_cmd_mkey_create(ctx, &mkey_attr);
if (!mkey) {
claim_zero(mlx5_glue->dereg_mr(ibv_mr));
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.c b/drivers/common/mlx5/mlx5_devx_cmds.c
index c4ac2aaceed..140b057ab47 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.c
+++ b/drivers/common/mlx5/mlx5_devx_cmds.c
@@ -331,6 +331,29 @@ mlx5_devx_cmd_flow_counter_query(struct mlx5_devx_obj *dcs,
return 0;
}
+/**
+ * Apply PCI relaxed-ordering and read-after-write ordering to mkey attributes.
+ *
+ * @param[in, out] mkey_attr
+ * Mkey attributes to update.
+ * @param[in] hca_attr
+ * HCA capabilities from mlx5_devx_cmd_query_hca_attr().
+ */
+RTE_EXPORT_INTERNAL_SYMBOL(mlx5_devx_mkey_attr_set_ordering)
+void
+mlx5_devx_mkey_attr_set_ordering(struct mlx5_devx_mkey_attr *mkey_attr,
+ const struct mlx5_hca_attr *hca_attr)
+{
+ if (!mkey_attr || !hca_attr)
+ return;
+
+ mkey_attr->relaxed_ordering_write = hca_attr->relaxed_ordering_write;
+ mkey_attr->relaxed_ordering_read =
+ hca_attr->relaxed_ordering_read || hca_attr->pci_relaxed_ordered_read;
+ if (hca_attr->mkc_order_read_after_write)
+ mkey_attr->read_after_write_ordering = MLX5_MKC_RAW_ORDERING_RO;
+}
+
/**
* Create a new mkey.
*
@@ -417,6 +440,8 @@ mlx5_devx_cmd_mkey_create(void *ctx,
MLX5_SET(mkc, mkc, relaxed_ordering_write,
attr->relaxed_ordering_write);
MLX5_SET(mkc, mkc, relaxed_ordering_read, attr->relaxed_ordering_read);
+ MLX5_SET(mkc, mkc, order_read_after_write,
+ attr->read_after_write_ordering);
MLX5_SET64(mkc, mkc, start_addr, attr->addr);
MLX5_SET64(mkc, mkc, len, attr->size);
MLX5_SET(mkc, mkc, crypto_en, attr->crypto_en);
@@ -1003,6 +1028,12 @@ mlx5_devx_cmd_query_hca_attr(void *ctx,
relaxed_ordering_write);
attr->relaxed_ordering_read = MLX5_GET(cmd_hca_cap, hcattr,
relaxed_ordering_read);
+ attr->pci_relaxed_ordered_read = MLX5_GET(cmd_hca_cap, hcattr,
+ pci_relaxed_ordered_read);
+ attr->mkc_order_read_after_write = MLX5_GET(cmd_hca_cap, hcattr,
+ mkc_order_read_after_write);
+ attr->mkc_order_write_after_write_ro_only = MLX5_GET(cmd_hca_cap, hcattr,
+ mkc_order_write_after_write_ro_only);
attr->access_register_user = MLX5_GET(cmd_hca_cap, hcattr,
access_register_user);
attr->eth_net_offloads = MLX5_GET(cmd_hca_cap, hcattr,
diff --git a/drivers/common/mlx5/mlx5_devx_cmds.h b/drivers/common/mlx5/mlx5_devx_cmds.h
index 82d949972bb..90beb2e9e6c 100644
--- a/drivers/common/mlx5/mlx5_devx_cmds.h
+++ b/drivers/common/mlx5/mlx5_devx_cmds.h
@@ -34,6 +34,7 @@ struct mlx5_devx_mkey_attr {
uint32_t pg_access:1;
uint32_t relaxed_ordering_write:1;
uint32_t relaxed_ordering_read:1;
+ uint32_t read_after_write_ordering:2;
uint32_t umr_en:1;
uint32_t crypto_en:2;
uint32_t set_remote_rw:1;
@@ -237,6 +238,9 @@ struct mlx5_hca_attr {
uint32_t vhca_id:16;
uint32_t relaxed_ordering_write:1;
uint32_t relaxed_ordering_read:1;
+ uint32_t pci_relaxed_ordered_read:1;
+ uint32_t mkc_order_read_after_write:1;
+ uint32_t mkc_order_write_after_write_ro_only:1;
uint32_t access_register_user:1;
uint32_t wqe_index_ignore:1;
uint32_t cross_channel:1;
@@ -748,6 +752,11 @@ int mlx5_devx_cmd_query_hca_attr(void *ctx,
__rte_internal
struct mlx5_devx_obj *mlx5_devx_cmd_mkey_create(void *ctx,
struct mlx5_devx_mkey_attr *attr);
+
+__rte_internal
+void
+mlx5_devx_mkey_attr_set_ordering(struct mlx5_devx_mkey_attr *mkey_attr,
+ const struct mlx5_hca_attr *hca_attr);
__rte_internal
int mlx5_devx_get_out_command_status(void *out);
__rte_internal
diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index 3bb072a7fec..c2810194f8e 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -1463,7 +1463,9 @@ struct mlx5_ifc_mkc_bits {
u8 bsf_octword_size[0x20];
u8 reserved_at_120[0x80];
u8 translations_octword_size[0x20];
- u8 reserved_at_1c0[0x19];
+ u8 reserved_at_1c0[0x16];
+ u8 order_read_after_write[0x2];
+ u8 reserved_at_1d8[0x1];
u8 relaxed_ordering_read[0x1];
u8 reserved_at_1da[0x1];
u8 log_page_size[0x5];
@@ -1478,6 +1480,13 @@ enum {
MLX5_MKEY_CRYPTO_ENABLED = 0x1,
};
+/* MKC read_after_write_ordering field (2-bit, dword 0x38 bits 9:8). */
+enum mlx5_mkc_raw_ordering {
+ MLX5_MKC_RAW_ORDERING_SO = 0x0,
+ MLX5_MKC_RAW_ORDERING_SAO = 0x1,
+ MLX5_MKC_RAW_ORDERING_RO = 0x2,
+};
+
struct mlx5_ifc_create_mkey_out_bits {
u8 status[0x8];
u8 reserved_at_8[0x18];
@@ -1827,7 +1836,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 log_max_mcg[0x8];
u8 reserved_at_320[0x3];
u8 log_max_transport_domain[0x5];
- u8 reserved_at_328[0x3];
+ u8 reserved_at_328[0x2];
+ u8 pci_relaxed_ordered_read[0x1];
u8 log_max_pd[0x5];
u8 reserved_at_330[0xb];
u8 log_max_xrcd[0x5];
@@ -1860,7 +1870,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 ext_stride_num_range[0x1];
u8 reserved_at_3a1[0x2];
u8 log_max_stride_sz_rq[0x5];
- u8 reserved_at_3a8[0x3];
+ u8 mkc_order_read_after_write[0x1];
+ u8 mkc_order_write_after_write_ro_only[0x1];
+ u8 reserved_at_3aa[0x1];
u8 log_min_stride_sz_rq[0x5];
u8 reserved_at_3b0[0x3];
u8 log_max_stride_sz_sq[0x5];
diff --git a/drivers/common/mlx5/windows/mlx5_common_os.c b/drivers/common/mlx5/windows/mlx5_common_os.c
index c790c9a4aeb..bdafb95df98 100644
--- a/drivers/common/mlx5/windows/mlx5_common_os.c
+++ b/drivers/common/mlx5/windows/mlx5_common_os.c
@@ -384,7 +384,7 @@ mlx5_os_reg_mr(void *pd,
{
struct mlx5_devx_mkey_attr mkey_attr;
struct mlx5_pd *mlx5_pd = (struct mlx5_pd *)pd;
- struct mlx5_hca_attr attr;
+ struct mlx5_hca_attr attr = { 0 };
struct mlx5_devx_obj *mkey;
void *obj;
@@ -403,10 +403,8 @@ mlx5_os_reg_mr(void *pd,
mkey_attr.size = length;
mkey_attr.umem_id = ((struct mlx5_devx_umem *)(obj))->umem_id;
mkey_attr.pd = mlx5_pd->pdn;
- if (!mlx5_haswell_broadwell_cpu) {
- mkey_attr.relaxed_ordering_write = attr.relaxed_ordering_write;
- mkey_attr.relaxed_ordering_read = attr.relaxed_ordering_read;
- }
+ if (!mlx5_haswell_broadwell_cpu)
+ mlx5_devx_mkey_attr_set_ordering(&mkey_attr, &attr);
mkey = mlx5_devx_cmd_mkey_create(mlx5_pd->devx_ctx, &mkey_attr);
if (!mkey) {
claim_zero(mlx5_os_umem_dereg(obj));
diff --git a/drivers/crypto/mlx5/mlx5_crypto.c b/drivers/crypto/mlx5/mlx5_crypto.c
index dd0aabb6d75..448dd0c5a4e 100644
--- a/drivers/crypto/mlx5/mlx5_crypto.c
+++ b/drivers/crypto/mlx5/mlx5_crypto.c
@@ -97,7 +97,11 @@ mlx5_crypto_indirect_mkeys_prepare(struct mlx5_crypto_priv *priv,
mlx5_crypto_mkey_update_t update_cb)
{
uint32_t i;
+ struct mlx5_hca_attr *hca_attr = &priv->cdev->config.hca_attr;
+ /* If only relaxed order is allowed. */
+ if (hca_attr->mkc_order_write_after_write_ro_only)
+ mlx5_devx_mkey_attr_set_ordering(attr, hca_attr);
for (i = 0; i < qp->entries_n; i++) {
attr->klm_array = update_cb(priv, qp, i);
qp->mkey[i] = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, attr);
diff --git a/drivers/regex/mlx5/mlx5_regex_fastpath.c b/drivers/regex/mlx5/mlx5_regex_fastpath.c
index 3207bcbc603..55f7411593a 100644
--- a/drivers/regex/mlx5/mlx5_regex_fastpath.c
+++ b/drivers/regex/mlx5/mlx5_regex_fastpath.c
@@ -755,9 +755,14 @@ mlx5_regexdev_setup_fastpath(struct mlx5_regex_priv *priv, uint32_t qp_id)
setup_qps(priv, qp);
if (priv->has_umr) {
+ struct mlx5_hca_attr *hca_attr = &priv->cdev->config.hca_attr;
+
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
attr.pd = priv->cdev->pdn;
#endif
+ /* If only relaxed order is allowed. */
+ if (hca_attr->mkc_order_write_after_write_ro_only)
+ mlx5_devx_mkey_attr_set_ordering(&attr, hca_attr);
for (i = 0; i < qp->nb_desc; i++) {
attr.klm_num = MLX5_REGEX_MAX_KLM_NUM;
attr.klm_array = qp->jobs[i].imkey_array;
diff --git a/drivers/regex/mlx5/mlx5_rxp.c b/drivers/regex/mlx5/mlx5_rxp.c
index dda4a7fdb0b..b865c08b53c 100644
--- a/drivers/regex/mlx5/mlx5_rxp.c
+++ b/drivers/regex/mlx5/mlx5_rxp.c
@@ -54,6 +54,7 @@ rxp_create_mkey(struct mlx5_regex_priv *priv, void *ptr, size_t size,
uint32_t access, struct mlx5_regex_mkey *mkey)
{
struct mlx5_devx_mkey_attr mkey_attr;
+ struct mlx5_hca_attr *hca_attr = &priv->cdev->config.hca_attr;
/* Register the memory. */
mkey->umem = mlx5_glue->devx_umem_reg(priv->cdev->ctx, ptr, size, access);
@@ -72,6 +73,9 @@ rxp_create_mkey(struct mlx5_regex_priv *priv, void *ptr, size_t size,
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
mkey_attr.pd = priv->cdev->pdn;
#endif
+ /* If only relaxed order is allowed. */
+ if (hca_attr->mkc_order_write_after_write_ro_only)
+ mlx5_devx_mkey_attr_set_ordering(&mkey_attr, hca_attr);
mkey->mkey = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, &mkey_attr);
if (!mkey->mkey) {
DRV_LOG(ERR, "Failed to create direct mkey!");
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_mem.c b/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
index 4dfe800b8fc..8c9d169d2a8 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_mem.c
@@ -179,6 +179,7 @@ static int
mlx5_vdpa_create_indirect_mkey(struct mlx5_vdpa_priv *priv)
{
struct mlx5_devx_mkey_attr mkey_attr;
+ struct mlx5_hca_attr *hca_attr = &priv->cdev->config.hca_attr;
struct mlx5_vdpa_query_mr *mrs =
(struct mlx5_vdpa_query_mr *)priv->mrs;
struct mlx5_vdpa_query_mr *entry;
@@ -242,6 +243,9 @@ mlx5_vdpa_create_indirect_mkey(struct mlx5_vdpa_priv *priv)
mkey_attr.pg_access = 0;
mkey_attr.klm_array = klm_array;
mkey_attr.klm_num = klm_index;
+ /* If only relaxed order is allowed. */
+ if (hca_attr->mkc_order_write_after_write_ro_only)
+ mlx5_devx_mkey_attr_set_ordering(&mkey_attr, hca_attr);
entry = &mrs[mem->nregions];
entry->mkey = mlx5_devx_cmd_mkey_create(priv->cdev->ctx, &mkey_attr);
if (!entry->mkey) {
--
2.21.0
^ permalink raw reply related
* [PATCH v10 0/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-16 12:31 UTC (permalink / raw)
To: dev, stephen; +Cc: longli, weh
From: Wei Hu <weh@microsoft.com>
Add support for handling hardware service reset events in the
MANA driver. When the MANA kernel driver receives a hardware
service event, it initiates a device reset and notifies userspace
via IBV_EVENT_DEVICE_FATAL. The MANA PMD handles this by
performing an automatic teardown and recovery sequence.
The driver uses ethdev recovery events (ERR_RECOVERING,
RECOVERY_SUCCESS, RECOVERY_FAILED) to notify upper layers of
the reset lifecycle, and a PCI device removal event callback
to distinguish hot-remove from service reset.
Changes since v9:
- Fixed fd leak in the secondary RESET_EXIT IPC handler: when
the doorbell page was already mapped, the fd from the message
was not closed. Moved close(fd) outside the if/else so it runs
unconditionally whenever an fd is present.
Changes since v8:
- Fixed reset thread resource leak: previously reset_thread_active
was cleared before emitting recovery callbacks, so no join site
would reap the thread. Now the flag stays true throughout the
thread lifetime. mana_join_reset_thread detects the self-join
case (callback calling dev_stop/dev_close from the reset thread)
using rte_thread_equal and calls rte_thread_detach instead of
join, so thread resources are freed on exit. External callers
continue to join normally.
- Fixed lost condvar signal: added a predicate loop around
pthread_cond_timedwait that checks dev_state under
reset_cond_mutex. If mana_pci_remove_event_cb signals before
the reset thread enters the wait, the wakeup is no longer lost.
The PCI remove callback sets dev_state to RESET_FAILED under
the same mutex before signaling.
- Added a lock/unlock barrier on reset_ops_lock in
mana_pci_remove_event_cb to ensure teardown has completed
before emitting the INTR_RMV event.
- Fixed mana_reset_exit_delay return type from uint32_t to int
to match the negative error codes it stores.
- Removed unnecessary else-after-goto in mana_probe_port.
Changes since v7:
- Moved heavy teardown (dev_stop, IPC to secondaries, dev_close,
MR btree free) from mana_reset_enter (EAL interrupt thread)
to mana_reset_thread (control thread). The interrupt handler
now only sets state, drains in-flight bursts, and spawns the
thread. Teardown runs immediately in the control thread before
the recovery timer wait, avoiding blocking the interrupt thread
on multi-second IPC timeouts and ibverbs calls. Each function
now owns its own lock scope with no lock hand-off between
threads.
- Simplified burst_state from encoding device state in bits 1+
to a single blocked flag (bit 1). Only one value was ever
stored, so the multi-state encoding was misleading. Added
MANA_BURST_BLOCKED constant.
- Updated mana.rst to reflect that teardown runs on the control
thread, not the interrupt handler.
Changes since v6:
- Rebased onto latest upstream for-main
- Replaced removed RTE_ETH_DEV_TO_PCI macro with
RTE_CLASS_TO_BUS_DEVICE (upstream commit 4757b8df04
removed the old bus-specific ethdev convenience macros)
Changes since v5:
- Replaced RCU QSBR with per-queue atomic burst_state using a
single-variable CAS design: bit 0 is the in-burst flag, bit 1
is the blocked flag. The data path uses CAS(0→1) to enter
burst and fetch_and(~1) to exit. The reset path uses fetch_or
to set the blocked bit and polls bit 0 to drain in-flight
bursts. This eliminates the two-variable Dekker pattern and the
need for sequential consistency (seq_cst) ordering.
- Removed librte_rcu dependency
- Removed __rte_no_thread_safety_analysis annotations (no longer
needed after mutex conversion)
- Moved ERR_RECOVERING event emission before acquiring
reset_ops_lock and before mana_reset_enter, so upper layers
(e.g. netvsc) can switch data path before mana stops queues.
Emitting outside the lock avoids deadlock if the callback
calls dev_stop or dev_close.
- Replaced MANA_OPS_*_LOCK macros with mana_reset_trylock()
helper function and explicit per-operation wrappers
- Removed unused rte_alarm.h and rte_lock_annotations.h includes
- Added RECOVERY_FAILED event when mana_reset_enter fails
internally, so the application always receives a terminal event
- Added mana_clear_burst_state() helper to clear per-queue
burst_state on failure paths (reset_failed, dev_stop_lock,
dev_close_lock) preventing permanent silent packet drop after
a failed reset
Changes since v4:
- Fixed stale rte_spinlock_unlock call in mana_intr_handler that
was missed during the spinlock-to-mutex conversion, causing a
-Wincompatible-pointer-types warning
Changes since v3:
- Converted reset_ops_lock from rte_spinlock_t to pthread_mutex_t
with PTHREAD_PROCESS_SHARED, since the lock is held across
blocking IB verbs calls and IPC with 5s timeout
- Removed rte_dev_event_callback_unregister retry loop to avoid
deadlock when interrupt thread and reset thread contend
Changes since v2:
- Added per-queue burst_state atomic variable with Dekker-like
synchronization to block data path during reset without RCU
- Replaced rte_alarm with condvar + control thread for reset exit
- Made reset_thread_active atomic with CAS — flag is set by
creator and only cleared by the joiner, not the thread itself
- Fixed second reset crash: removed reset thread join logic from
mana_dev_close (inner function) to avoid corrupting dev_state
when called from mana_reset_enter
- Made reset_thread_active RTE_ATOMIC(bool) with explicit ordering
- Added retry loop for rte_dev_event_callback_unregister on -EAGAIN
- Initialized condvar/mutex with PTHREAD_PROCESS_SHARED since priv
is in hugepage shared memory
- Added re-check of dev_state after lock acquisition in
mana_intr_handler to prevent racing with pci_remove_event_cb
- Replaced (void *)0 with NULL in mp.c
- Added lock ownership comment block at mana_reset_enter
- Documented rte_dev_event_monitor_start() requirement
- Added mana.rst documentation and release note
Changes since v1:
- Removed net/netvsc patch from this series
- Simplified reset exit: mana_reset_exit calls
mana_reset_exit_delay directly instead of spawning a thread
- Added __rte_no_thread_safety_analysis annotations for clang
- Switched to rte_thread_create_internal_control
- Fixed declaration-after-statement style issues
- Removed unnecessary blank lines and stale comments
Wei Hu (1):
net/mana: add device reset support
doc/guides/nics/mana.rst | 40 +
doc/guides/rel_notes/release_26_07.rst | 8 +
drivers/net/mana/mana.c | 1088 ++++++++++++++++++++++--
drivers/net/mana/mana.h | 52 +-
drivers/net/mana/mp.c | 92 +-
drivers/net/mana/mr.c | 6 +-
drivers/net/mana/rx.c | 23 +-
drivers/net/mana/tx.c | 44 +-
8 files changed, 1245 insertions(+), 108 deletions(-)
--
2.34.1
^ permalink raw reply
* [PATCH v10 1/1] net/mana: add device reset support
From: Wei Hu @ 2026-06-16 12:31 UTC (permalink / raw)
To: dev, stephen; +Cc: longli, weh
In-Reply-To: <20260616123158.43583-1-weh@linux.microsoft.com>
From: Wei Hu <weh@microsoft.com>
Add support for handling hardware service reset events in the MANA driver.
When the MANA kernel driver receives a hardware service event, it
initiates a device reset and notifies userspace via
IBV_EVENT_DEVICE_FATAL. The DPDK driver handles this by performing
an automatic teardown and recovery sequence.
The interrupt handler sets the device state, blocks new data path
bursts, waits for in-flight bursts to drain using per-queue atomic
flags, and spawns a control thread. The control thread performs
teardown immediately (dev_stop, secondary IPC, dev_close, MR cache
free) before waiting for the hardware recovery timer to fire. This
avoids blocking the EAL interrupt thread on multi-second IPC
timeouts and ibverbs calls. After the recovery delay, the thread
unregisters the interrupt handler, re-probes the PCI device,
reinitializes MR caches, and restarts queues. Each function owns
its own lock scope with no lock hand-off between threads.
Each queue has an atomic burst_state variable where bit 0 is the
in-burst flag and bit 1 is a blocked flag. The data path uses a
single compare-and-swap (0 to 1) to enter a burst, which fails
immediately if the blocked bit is set. The reset path sets the
blocked bit via atomic fetch-or and polls bit 0 to wait for
in-flight bursts to drain. This single-variable design avoids the
need for sequential consistency ordering.
A per-device mutex serializes the reset path with ethdev
operations. The mutex uses PTHREAD_PROCESS_SHARED for multi-process
support and is held across blocking IB verbs calls. A trylock
helper encapsulates the lock acquisition and device state check
for all ethdev operation wrappers. Operations that cannot wait
(configure, queue setup) return -EBUSY during reset, while
dev_stop and dev_close join the reset thread before acquiring
the lock to ensure proper sequencing.
The reset thread keeps reset_thread_active true throughout its
lifetime. mana_join_reset_thread uses rte_thread_equal to detect
the self-join case (when a recovery callback calls dev_stop or
dev_close from the reset thread itself) and calls
rte_thread_detach instead of join, so thread resources are freed
on exit. External callers join normally.
The condvar wait in the reset thread uses a predicate loop that
checks dev_state under reset_cond_mutex, so a PCI remove signal
that arrives before the thread enters the wait is not lost. The
PCI remove callback sets dev_state to RESET_FAILED under the
same mutex before signaling. A lock/unlock barrier on
reset_ops_lock in the PCI remove path ensures teardown has
completed before emitting the INTR_RMV event.
Multi-process support is included: secondary processes unmap and
remap doorbell pages via IPC during the reset enter and exit
phases. The secondary RESET_EXIT handler closes the received fd
unconditionally after processing, even when the doorbell page is
already mapped. Data path functions in both primary and secondary
processes check the device state atomically and return early when
the device is not active.
The driver emits RTE_ETH_EVENT_ERR_RECOVERING before entering the
reset path so that upper layers (e.g. netvsc) can switch their
data path before queues are stopped. The event is emitted outside
the reset lock to avoid deadlock if the callback calls dev_stop or
dev_close. On completion, the driver emits RECOVERY_SUCCESS or
RECOVERY_FAILED after releasing the lock. If a recovery callback
triggers dev_stop or dev_close, the self-join detection in
mana_join_reset_thread detaches the thread to avoid deadlock. If
the enter phase fails internally, RECOVERY_FAILED is sent
immediately so the application receives a terminal event. A PCI
device removal event callback distinguishes hot-remove from
service reset.
Documentation for the device reset feature is added in the MANA
NIC guide and the 26.07 release notes.
Signed-off-by: Wei Hu <weh@microsoft.com>
---
doc/guides/nics/mana.rst | 40 +
doc/guides/rel_notes/release_26_07.rst | 8 +
drivers/net/mana/mana.c | 1088 ++++++++++++++++++++++--
drivers/net/mana/mana.h | 52 +-
drivers/net/mana/mp.c | 92 +-
drivers/net/mana/mr.c | 6 +-
drivers/net/mana/rx.c | 23 +-
drivers/net/mana/tx.c | 44 +-
8 files changed, 1245 insertions(+), 108 deletions(-)
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
index 0fcab6e2f6..08e345ea61 100644
--- a/doc/guides/nics/mana.rst
+++ b/doc/guides/nics/mana.rst
@@ -71,3 +71,43 @@ The user can specify below argument in devargs.
The default value is not set,
meaning all the NICs will be probed and loaded.
User can specify multiple mac=xx:xx:xx:xx:xx:xx arguments for up to 8 NICs.
+
+Device Reset Support
+--------------------
+
+The MANA PMD supports automatic recovery from hardware service reset events.
+When the MANA kernel driver receives a hardware service event,
+it initiates a device reset and notifies userspace
+via ``IBV_EVENT_DEVICE_FATAL``.
+
+The driver handles this transparently through a two-phase reset flow:
+
+* **Enter phase**: The interrupt handler blocks new data path bursts
+ and waits for all in-flight burst calls to drain
+ using per-queue atomic flags,
+ then spawns a control thread for the remaining work.
+
+* **Teardown and exit phase**: The control thread tears down
+ IB resources and queues, unmaps secondary process doorbell pages,
+ and closes the device. After a delay for hardware recovery,
+ it re-probes the PCI device,
+ reinstalls the interrupt handler,
+ reinitializes resources, and restarts queues.
+
+The driver emits the following ethdev recovery events
+to notify upper layers (e.g. netvsc) of the reset lifecycle:
+
+``RTE_ETH_EVENT_ERR_RECOVERING``
+ Reset has started.
+
+``RTE_ETH_EVENT_RECOVERY_SUCCESS``
+ Device has recovered successfully.
+
+``RTE_ETH_EVENT_RECOVERY_FAILED``
+ Recovery failed.
+
+To distinguish a PCI hot-remove from a service reset,
+the driver registers for PCI device removal events.
+This requires the application to call ``rte_dev_event_monitor_start()``
+for removal events to be delivered
+(e.g. testpmd ``--hot-plug-handling`` option).
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index bd0cec2709..58e8c2422e 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -122,6 +122,14 @@ New Features
Added AGENTS.md file for AI review
and supporting scripts to review patches and documentation.
+* **Added device reset support to the MANA PMD.**
+
+ Added automatic recovery from hardware service reset events
+ in the MANA poll mode driver. The driver uses ethdev recovery events
+ (``RTE_ETH_EVENT_ERR_RECOVERING``, ``RTE_ETH_EVENT_RECOVERY_SUCCESS``,
+ ``RTE_ETH_EVENT_RECOVERY_FAILED``) to notify upper layers of the
+ reset lifecycle.
+
Removed Items
-------------
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 67396cda1f..0b72f711a1 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_configure(struct rte_eth_dev *dev)
RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
priv->num_queues = dev->data->nb_rx_queues;
+ DRV_LOG(DEBUG, "priv %p, port %u, dev port %u, num_queues: %u",
+ priv, priv->port_id, priv->dev_port, priv->num_queues);
manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -214,8 +216,8 @@ mana_dev_start(struct rte_eth_dev *dev)
DRV_LOG(INFO, "TX/RX queues have started");
- /* Enable datapath for secondary processes */
- mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+ /* Intentionally ignore errors — secondary may not be running */
+ (void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
ret = rxq_intr_enable(priv);
if (ret) {
@@ -242,26 +244,33 @@ mana_dev_stop(struct rte_eth_dev *dev)
{
int ret;
struct mana_priv *priv = dev->data->dev_private;
-
- rxq_intr_disable(priv);
+ enum mana_device_state state;
+
+ state = rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire);
+ if (state == MANA_DEV_ACTIVE ||
+ state == MANA_DEV_RESET_FAILED) {
+ rxq_intr_disable(priv);
+ DRV_LOG(DEBUG, "rxq_intr_disable called");
+ }
dev->tx_pkt_burst = mana_tx_burst_removed;
dev->rx_pkt_burst = mana_rx_burst_removed;
- /* Stop datapath on secondary processes */
- mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+ /* Intentionally ignore errors — secondary may not be running */
+ (void)mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
rte_wmb();
ret = mana_stop_tx_queues(dev);
if (ret) {
- DRV_LOG(ERR, "failed to stop tx queues");
+ DRV_LOG(ERR, "failed to stop tx queues, ret %d", ret);
return ret;
}
ret = mana_stop_rx_queues(dev);
if (ret) {
- DRV_LOG(ERR, "failed to stop tx queues");
+ DRV_LOG(ERR, "failed to stop rx queues, ret %d", ret);
return ret;
}
@@ -275,36 +284,66 @@ mana_dev_close(struct rte_eth_dev *dev)
{
struct mana_priv *priv = dev->data->dev_private;
int ret;
+ enum mana_device_state state;
+ DRV_LOG(DEBUG, "Free MR for priv %p", priv);
mana_remove_all_mr(priv);
- ret = mana_intr_uninstall(priv);
- if (ret)
- return ret;
+ state = rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire);
+ if (state == MANA_DEV_ACTIVE ||
+ state == MANA_DEV_RESET_FAILED) {
+ ret = mana_intr_uninstall(priv);
+ if (ret)
+ return ret;
+ }
if (priv->ib_parent_pd) {
- int err = ibv_dealloc_pd(priv->ib_parent_pd);
- if (err)
- DRV_LOG(ERR, "Failed to deallocate parent PD: %d", err);
+ ret = ibv_dealloc_pd(priv->ib_parent_pd);
+ if (ret)
+ DRV_LOG(ERR,
+ "Failed to deallocate parent PD: %d", ret);
priv->ib_parent_pd = NULL;
}
if (priv->ib_pd) {
- int err = ibv_dealloc_pd(priv->ib_pd);
- if (err)
- DRV_LOG(ERR, "Failed to deallocate PD: %d", err);
+ ret = ibv_dealloc_pd(priv->ib_pd);
+ if (ret)
+ DRV_LOG(ERR, "Failed to deallocate PD: %d", ret);
priv->ib_pd = NULL;
}
- ret = ibv_close_device(priv->ib_ctx);
- if (ret) {
- ret = errno;
- return ret;
+ state = rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire);
+ if (state == MANA_DEV_ACTIVE ||
+ state == MANA_DEV_RESET_FAILED) {
+ if (priv->ib_ctx) {
+ ret = ibv_close_device(priv->ib_ctx);
+ if (ret) {
+ ret = errno;
+ return ret;
+ }
+ priv->ib_ctx = NULL;
+ }
}
return 0;
}
+/*
+ * Called from mana_pci_remove to free resources allocated
+ * during probe that are not freed by dev_close.
+ */
+static void
+mana_dev_free_resources(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+
+ pthread_mutex_destroy(&priv->reset_ops_lock);
+ pthread_mutex_destroy(&priv->reset_cond_mutex);
+ pthread_cond_destroy(&priv->reset_cond);
+}
+
static int
mana_dev_info_get(struct rte_eth_dev *dev,
struct rte_eth_dev_info *dev_info)
@@ -391,6 +430,39 @@ mana_dev_info_get(struct rte_eth_dev *dev,
return 0;
}
+/*
+ * Try to acquire the reset lock and verify the device is active.
+ * Returns 0 with lock held on success, or -EBUSY if the lock
+ * could not be acquired or the device is not in ACTIVE state.
+ */
+static int
+mana_reset_trylock(struct mana_priv *priv)
+{
+ if (pthread_mutex_trylock(&priv->reset_ops_lock))
+ return -EBUSY;
+
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+static int
+mana_dev_info_get_lock(struct rte_eth_dev *dev,
+ struct rte_eth_dev_info *dev_info)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_dev_info_get(dev, dev_info);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
static void
mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
struct rte_eth_txq_info *qinfo)
@@ -552,6 +624,22 @@ mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
return ret;
}
+static int
+mana_dev_tx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+ uint16_t nb_desc, unsigned int socket_id,
+ const struct rte_eth_txconf *tx_conf)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_dev_tx_queue_setup(dev, queue_idx,
+ nb_desc, socket_id, tx_conf);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
static void
mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
@@ -629,6 +717,23 @@ mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
return ret;
}
+static int
+mana_dev_rx_queue_setup_lock(struct rte_eth_dev *dev, uint16_t queue_idx,
+ uint16_t nb_desc, unsigned int socket_id,
+ const struct rte_eth_rxconf *rx_conf __rte_unused,
+ struct rte_mempool *mp)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_dev_rx_queue_setup(dev, queue_idx, nb_desc,
+ socket_id, rx_conf, mp);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
static void
mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
{
@@ -820,33 +925,267 @@ mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
return mana_ifreq(priv, SIOCSIFMTU, &request);
}
+static int
+mana_dev_configure_lock(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_dev_configure(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_dev_start_lock(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_dev_start(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+/*
+ * Join the reset thread if it is active. Uses CAS on
+ * reset_thread_active to ensure only one caller joins.
+ * If called from the reset thread itself (e.g. via a recovery
+ * event callback that calls dev_stop/dev_close), detach instead
+ * of joining to avoid deadlock and let the thread self-free.
+ */
+static void
+mana_join_reset_thread(struct mana_priv *priv)
+{
+ bool expected = true;
+
+ if (rte_atomic_compare_exchange_strong_explicit(
+ &priv->reset_thread_active, &expected, false,
+ rte_memory_order_acq_rel,
+ rte_memory_order_acquire)) {
+ if (rte_thread_equal(rte_thread_self(),
+ priv->reset_thread)) {
+ /* Self case: detach so resources are freed on
+ * thread exit. Don't modify dev_state — the
+ * caller (dev_stop_lock/dev_close_lock) handles
+ * state transitions.
+ */
+ rte_thread_detach(priv->reset_thread);
+ return;
+ }
+
+ pthread_mutex_lock(&priv->reset_cond_mutex);
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_ACTIVE, rte_memory_order_release);
+ pthread_cond_signal(&priv->reset_cond);
+ pthread_mutex_unlock(&priv->reset_cond_mutex);
+ rte_thread_join(priv->reset_thread, NULL);
+ }
+}
+
+/*
+ * Clear per-queue burst_state so the data path CAS can succeed again.
+ * Must be called under reset_ops_lock when transitioning back to ACTIVE
+ * after a failed or aborted reset.
+ */
+static void
+mana_clear_burst_state(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int i;
+
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ if (rxq)
+ rte_atomic_store_explicit(&rxq->burst_state, 0,
+ rte_memory_order_release);
+ if (txq)
+ rte_atomic_store_explicit(&txq->burst_state, 0,
+ rte_memory_order_release);
+ }
+}
+
+/*
+ * Custom lock wrappers for dev_stop and dev_close.
+ * These join any active reset thread and use a blocking lock (not
+ * trylock) so they wait for any in-progress reset processing to
+ * finish, rather than returning -EBUSY. When the device is not in
+ * MANA_DEV_ACTIVE state, they transition state to MANA_DEV_ACTIVE.
+ */
+static int
+mana_dev_stop_lock(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ mana_join_reset_thread(priv);
+
+ pthread_mutex_lock(&priv->reset_ops_lock);
+
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+ mana_clear_burst_state(dev);
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_ACTIVE, rte_memory_order_release);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return 0;
+ }
+
+ ret = mana_dev_stop(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_dev_close_lock(struct rte_eth_dev *dev)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ mana_join_reset_thread(priv);
+
+ pthread_mutex_lock(&priv->reset_ops_lock);
+
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_ACTIVE) {
+ mana_clear_burst_state(dev);
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_ACTIVE, rte_memory_order_release);
+ }
+
+ ret = mana_dev_close(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_rss_hash_update_lock(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_rss_hash_update(dev, rss_conf);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_rss_hash_conf_get_lock(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_rss_hash_conf_get(dev, rss_conf);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static void
+mana_dev_tx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+
+ if (mana_reset_trylock(priv)) {
+ DRV_LOG(ERR, "Device reset in progress, "
+ "mana_dev_tx_queue_release not called");
+ return;
+ }
+ mana_dev_tx_queue_release(dev, qid);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static void
+mana_dev_rx_queue_release_lock(struct rte_eth_dev *dev, uint16_t qid)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+
+ if (mana_reset_trylock(priv)) {
+ DRV_LOG(ERR, "Device reset in progress, "
+ "mana_dev_rx_queue_release not called");
+ return;
+ }
+ mana_dev_rx_queue_release(dev, qid);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_rx_intr_enable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_rx_intr_enable(dev, rx_queue_id);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_rx_intr_disable_lock(struct rte_eth_dev *dev, uint16_t rx_queue_id)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_rx_intr_disable(dev, rx_queue_id);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
+static int
+mana_mtu_set_lock(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ if (mana_reset_trylock(priv))
+ return -EBUSY;
+ ret = mana_mtu_set(dev, mtu);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+}
+
static const struct eth_dev_ops mana_dev_ops = {
- .dev_configure = mana_dev_configure,
- .dev_start = mana_dev_start,
- .dev_stop = mana_dev_stop,
- .dev_close = mana_dev_close,
- .dev_infos_get = mana_dev_info_get,
+ .dev_configure = mana_dev_configure_lock,
+ .dev_start = mana_dev_start_lock,
+ .dev_stop = mana_dev_stop_lock,
+ .dev_close = mana_dev_close_lock,
+ .dev_infos_get = mana_dev_info_get_lock,
.txq_info_get = mana_dev_tx_queue_info,
.rxq_info_get = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
- .rss_hash_update = mana_rss_hash_update,
- .rss_hash_conf_get = mana_rss_hash_conf_get,
- .tx_queue_setup = mana_dev_tx_queue_setup,
- .tx_queue_release = mana_dev_tx_queue_release,
- .rx_queue_setup = mana_dev_rx_queue_setup,
- .rx_queue_release = mana_dev_rx_queue_release,
- .rx_queue_intr_enable = mana_rx_intr_enable,
- .rx_queue_intr_disable = mana_rx_intr_disable,
+ .rss_hash_update = mana_rss_hash_update_lock,
+ .rss_hash_conf_get = mana_rss_hash_conf_get_lock,
+ .tx_queue_setup = mana_dev_tx_queue_setup_lock,
+ .tx_queue_release = mana_dev_tx_queue_release_lock,
+ .rx_queue_setup = mana_dev_rx_queue_setup_lock,
+ .rx_queue_release = mana_dev_rx_queue_release_lock,
+ .rx_queue_intr_enable = mana_rx_intr_enable_lock,
+ .rx_queue_intr_disable = mana_rx_intr_disable_lock,
.link_update = mana_dev_link_update,
.stats_get = mana_dev_stats_get,
.stats_reset = mana_dev_stats_reset,
- .mtu_set = mana_mtu_set,
+ .mtu_set = mana_mtu_set_lock,
};
static const struct eth_dev_ops mana_dev_secondary_ops = {
.stats_get = mana_dev_stats_get,
.stats_reset = mana_dev_stats_reset,
- .dev_infos_get = mana_dev_info_get,
+ .dev_infos_get = mana_dev_info_get_lock,
};
uint16_t
@@ -1031,28 +1370,516 @@ mana_ibv_device_to_pci_addr(const struct ibv_device *device,
return 0;
}
+static int mana_pci_probe(struct rte_pci_driver *pci_drv,
+ struct rte_pci_device *pci_dev);
+static void mana_intr_handler(void *arg);
+static void mana_reset_exit(struct mana_priv *priv);
+
+/* Delay before initiating reset exit after reset enter completes */
+#define MANA_RESET_TIMER_US (15 * 1000000ULL) /* 15 seconds */
+
/*
- * Interrupt handler from IB layer to notify this device is being removed.
+ * Callback for PCI device removal events from EAL.
+ * If the device is in reset (RESET_EXIT state), this means the PCI
+ * device was hot-removed rather than a service reset. Wake the reset
+ * thread via condvar and notify netvsc via RTE_ETH_EVENT_INTR_RMV.
+ */
+static void
+mana_pci_remove_event_cb(const char *device_name,
+ enum rte_dev_event_type event, void *cb_arg)
+{
+ struct mana_priv *priv = cb_arg;
+ struct rte_eth_dev *dev;
+
+ if (event != RTE_DEV_EVENT_REMOVE)
+ return; /* every other device event is ignored */
+
+ DRV_LOG(INFO, "PCI device %s removed", device_name);
+
+ /* Mark the device failed (whatever its state) and wake the reset thread */
+ pthread_mutex_lock(&priv->reset_cond_mutex);
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_RESET_FAILED, rte_memory_order_release);
+ pthread_cond_signal(&priv->reset_cond);
+ pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+ /* Barrier: wait until any holder of reset_ops_lock (e.g. the reset
+ * thread in teardown) releases it, then emit INTR_RMV unlocked.
+ */
+ pthread_mutex_lock(&priv->reset_ops_lock);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+
+ dev = &rte_eth_devices[priv->port_id];
+ DRV_LOG(INFO, "Sending RTE_ETH_EVENT_INTR_RMV for port %u",
+ priv->port_id);
+ rte_eth_dev_callback_process(dev,
+ RTE_ETH_EVENT_INTR_RMV, NULL);
+}
+
+/*
+ * Reset thread: tears the port down, waits MANA_RESET_TIMER_US (or until
+ * PCI removal wakes it), then re-probes and restarts the device. Runs on
+ * a control thread because blocking IPC, ibv teardown and
+ * rte_intr_callback_unregister must not run on the EAL interrupt thread.
+ * Always returns 0; the outcome is reported via dev_state and events.
+ */
+static uint32_t
+mana_reset_thread(void *arg)
+{
+ struct mana_priv *priv = (struct mana_priv *)arg;
+ struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+ struct timespec ts;
+ int ret;
+ int i;
+
+ DRV_LOG(INFO, "Reset thread started");
+
+ pthread_mutex_lock(&priv->reset_ops_lock);
+
+ /* Teardown: stop data path, unmap secondary doorbells, close device,
+ * free MR caches. Must happen immediately — hardware may be gone.
+ */
+ ret = mana_dev_stop(dev);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to stop mana dev ret %d", ret);
+ goto reset_failed;
+ }
+
+ ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_ENTER);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to reset secondary processes ret = %d", ret);
+ goto reset_failed;
+ }
+
+ ret = mana_dev_close(dev);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to close mana dev ret %d", ret);
+ goto reset_failed;
+ }
+
+ /* Queues may be unset; mana_reset_enter NULL-checks them as well */
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ DRV_LOG(DEBUG, "Free MR for priv = %p, queue %d", priv, i);
+ if (rxq)
+ mana_mr_btree_free(&rxq->mr_btree);
+ if (txq)
+ mana_mr_btree_free(&txq->mr_btree);
+ }
+
+ DRV_LOG(DEBUG, "Teardown complete");
+
+ /* mana_pci_remove_event_cb may have stored RESET_FAILED (under
+ * reset_cond_mutex, without reset_ops_lock) during the teardown
+ * above; publish RESET_EXIT only if it did not, under that mutex.
+ */
+ pthread_mutex_lock(&priv->reset_cond_mutex);
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_RESET_FAILED)
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_RESET_EXIT, rte_memory_order_release);
+ pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+
+ /* Wait for the recovery timer before re-probing. dev_state is checked
+ * under reset_cond_mutex before each wait, so a RESET_FAILED already
+ * stored by mana_pci_remove_event_cb skips the wait entirely.
+ */
+ DRV_LOG(INFO, "Waiting %us for hardware recovery",
+ (unsigned int)(MANA_RESET_TIMER_US / 1000000));
+
+ clock_gettime(CLOCK_REALTIME, &ts);
+ ts.tv_sec += MANA_RESET_TIMER_US / 1000000;
+
+ pthread_mutex_lock(&priv->reset_cond_mutex);
+ while (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) == MANA_DEV_RESET_EXIT) {
+ if (pthread_cond_timedwait(&priv->reset_cond,
+ &priv->reset_cond_mutex, &ts))
+ break; /* timeout */
+ }
+ pthread_mutex_unlock(&priv->reset_cond_mutex);
+
+ pthread_mutex_lock(&priv->reset_ops_lock);
+
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+ DRV_LOG(INFO, "Reset thread: dev_state=%d, skipping exit",
+ (int)rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire));
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return 0;
+ }
+
+ DRV_LOG(INFO, "Reset thread: initiating reset exit");
+ mana_reset_exit(priv);
+ /* reset_ops_lock is released by mana_reset_exit_delay.
+ * reset_thread_active remains true; mana_join_reset_thread will
+ * later join or detach this thread.
+ */
+ return 0;
+
+reset_failed:
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_RESET_FAILED, rte_memory_order_release);
+ mana_clear_burst_state(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+
+ DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+ priv->port_id);
+ rte_eth_dev_callback_process(dev,
+ RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+ return 0;
+}
+
+static void
+mana_reset_enter(struct mana_priv *priv)
+{
+ int ret;
+ int i;
+ struct rte_eth_dev *dev = &rte_eth_devices[priv->port_id];
+
+ /*
+ * Quiesce the data path and hand the reset to a control thread.
+ * Lock ownership: mana_intr_handler acquires reset_ops_lock; this
+ * function releases it on both exits. The reset thread takes the
+ * lock again on its own for teardown and for the exit (re-probe).
+ */
+
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_ENTER,
+ rte_memory_order_release);
+
+ DRV_LOG(DEBUG, "Entering into device reset state");
+ DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+ /* First pass: set the blocked bit on every queue's burst_state so
+ * new bursts are rejected before we start waiting on any of them.
+ */
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ if (rxq)
+ rte_atomic_fetch_or_explicit(&rxq->burst_state,
+ MANA_BURST_BLOCKED,
+ rte_memory_order_release);
+ if (txq)
+ rte_atomic_fetch_or_explicit(&txq->burst_state,
+ MANA_BURST_BLOCKED,
+ rte_memory_order_release);
+ }
+
+ /* Second pass: spin until every in-flight burst clears bit 0 */
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ if (rxq)
+ while (rte_atomic_load_explicit(&rxq->burst_state,
+ rte_memory_order_acquire) & 1)
+ rte_pause();
+ if (txq)
+ while (rte_atomic_load_explicit(&txq->burst_state,
+ rte_memory_order_acquire) & 1)
+ rte_pause();
+ }
+
+ DRV_LOG(DEBUG, "All data path threads drained");
+
+ /* Join previous reset thread if it completed but was not joined.
+ * Use CAS to avoid double-join if another path joined first.
+ * Don't use mana_join_reset_thread() here — we are already in
+ * RESET_ENTER state and must not change dev_state to ACTIVE.
+ */
+ {
+ bool expected = true;
+
+ if (rte_atomic_compare_exchange_strong_explicit(
+ &priv->reset_thread_active, &expected, false,
+ rte_memory_order_acq_rel,
+ rte_memory_order_acquire))
+ rte_thread_join(priv->reset_thread, NULL);
+ }
+
+ ret = rte_thread_create_internal_control(&priv->reset_thread,
+ "mana-reset",
+ mana_reset_thread, priv);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to create reset thread ret %d", ret);
+ rte_atomic_store_explicit(&priv->dev_state,
+ MANA_DEV_RESET_FAILED,
+ rte_memory_order_release);
+ goto reset_failed;
+ }
+ rte_atomic_store_explicit(&priv->reset_thread_active,
+ true, rte_memory_order_release); /* reset_thread may now be joined */
+
+ DRV_LOG(DEBUG, "Reset thread started");
+
+ pthread_mutex_unlock(&priv->reset_ops_lock); /* lets the new thread begin teardown */
+ return;
+
+reset_failed:
+ mana_clear_burst_state(dev);
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+}
+
+static int
+mana_reset_exit_delay(void *arg)
+{
+ struct mana_priv *priv = (struct mana_priv *)arg;
+ int ret = 0;
+ int i;
+ struct rte_eth_dev *dev;
+ struct rte_pci_device *pci_dev;
+
+ DRV_LOG(DEBUG, "Delayed mana device reset complete processing");
+
+ /* Entered with reset_ops_lock held; every return path releases it.
+ * Nothing to do if dev_stop/dev_close moved state off RESET_EXIT.
+ */
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) != MANA_DEV_RESET_EXIT) {
+ DRV_LOG(DEBUG, "State is not RESET_EXIT, skipping");
+ pthread_mutex_unlock(&priv->reset_ops_lock);
+ return ret;
+ }
+
+ dev = &rte_eth_devices[priv->port_id];
+ pci_dev = RTE_CLASS_TO_BUS_DEVICE(dev, *pci_dev);
+
+ DRV_LOG(DEBUG, "Resetting dev = %p, priv = %p", dev, priv);
+
+ ret = ibv_close_device(priv->ib_ctx);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to close ibv device %d", ret);
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+ rte_memory_order_release);
+ goto out;
+ }
+ priv->ib_ctx = NULL;
+
+ ret = mana_pci_probe(NULL, pci_dev);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to probe mana pci dev ret %d", ret);
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+ rte_memory_order_release);
+ goto out;
+ }
+
+ /*
+ * Init the local MR caches (freed by mana_reset_thread in teardown).
+ */
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ ret = mana_mr_btree_init(&rxq->mr_btree,
+ MANA_MR_BTREE_PER_QUEUE_N,
+ rxq->socket);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to init RXQ %d MR btree "
+ "on socket %u, ret %d", i, rxq->socket, ret);
+ goto mr_init_failed_rxq;
+ }
+
+ ret = mana_mr_btree_init(&txq->mr_btree,
+ MANA_MR_BTREE_PER_QUEUE_N,
+ txq->socket);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to init TXQ %d MR btree "
+ "on socket %u, ret %d", i, txq->socket, ret);
+ goto mr_init_failed_txq;
+ }
+ }
+ DRV_LOG(DEBUG, "priv %p, num_queues %u", priv, priv->num_queues);
+
+ /* Send RESET_EXIT so secondaries remap their doorbell page */
+ ret = mana_mp_req_on_rxtx(dev, MANA_MP_REQ_RESET_EXIT);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to start secondary processes ret = %d",
+ ret);
+ goto mr_init_failed_all;
+ }
+
+ ret = mana_dev_start(dev);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to start mana dev ret %d", ret);
+ goto mr_init_failed_all;
+ }
+
+ /* Clear per-queue burst_state before marking device active so
+ * data path CAS can succeed again.
+ */
+ for (i = 0; i < priv->num_queues; i++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[i];
+ struct mana_txq *txq = dev->data->tx_queues[i];
+
+ if (rxq)
+ rte_atomic_store_explicit(&rxq->burst_state, 0,
+ rte_memory_order_release);
+ if (txq)
+ rte_atomic_store_explicit(&txq->burst_state, 0,
+ rte_memory_order_release);
+ }
+
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+ rte_memory_order_release);
+
+ DRV_LOG(DEBUG, "Exiting the reset complete processing");
+ goto out;
+
+mr_init_failed_all:
+ i = priv->num_queues; /* all btrees were initialized: free every index */
+ goto mr_init_failed_rxq;
+
+mr_init_failed_txq:
+ /* RXQ btree at index i was initialized, free it */
+ mana_mr_btree_free(&((struct mana_rxq *)
+ dev->data->rx_queues[i])->mr_btree);
+
+mr_init_failed_rxq:
+ /* Free all fully initialized btrees for indices < i */
+ for (int j = 0; j < i; j++) {
+ struct mana_rxq *rxq = dev->data->rx_queues[j];
+ struct mana_txq *txq = dev->data->tx_queues[j];
+
+ mana_mr_btree_free(&rxq->mr_btree);
+ mana_mr_btree_free(&txq->mr_btree);
+ }
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_RESET_FAILED,
+ rte_memory_order_release);
+
+out:
+ pthread_mutex_unlock(&priv->reset_ops_lock); /* the events below are emitted unlocked */
+
+ if (!ret) {
+ DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_SUCCESS for port %u",
+ priv->port_id);
+ rte_eth_dev_callback_process(dev,
+ RTE_ETH_EVENT_RECOVERY_SUCCESS, NULL);
+ } else {
+ DRV_LOG(INFO, "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+ priv->port_id);
+ rte_eth_dev_callback_process(dev,
+ RTE_ETH_EVENT_RECOVERY_FAILED, NULL);
+ }
+ return ret;
+}
+
+static void
+mana_reset_exit(struct mana_priv *priv)
+{
+ int ret;
+
+ if (!priv) {
+ DRV_LOG(ERR, "Private structure invalid");
+ return;
+ }
+ DRV_LOG(DEBUG, "Entering into device reset complete processing");
+
+ rxq_intr_disable(priv);
+
+ /* Unregister the interrupt handler. Since mana_reset_exit is always
+ * called from mana_reset_thread (a non-interrupt thread), the
+ * interrupt source is inactive and rte_intr_callback_unregister
+ * succeeds directly.
+ */
+ if (priv->intr_handle) {
+ ret = rte_intr_callback_unregister(priv->intr_handle,
+ mana_intr_handler, priv);
+ if (ret < 0)
+ DRV_LOG(ERR, "Failed to unregister intr callback ret %d",
+ ret);
+ else
+ DRV_LOG(DEBUG, "%d intr callback(s) removed", ret);
+
+ rte_intr_instance_free(priv->intr_handle); /* freed even if unregister failed */
+ priv->intr_handle = NULL;
+ }
+
+ /* Proceed directly to reset exit delay (re-probe and restart), which
+ * also drops the reset_ops_lock taken by mana_reset_thread. No extra
+ * thread needed: we already run on that non-interrupt control thread.
+ */
+ mana_reset_exit_delay(priv);
+}
+
+/*
+ * Interrupt handler from IB layer to notify this device is
+ * being removed or reset.
*/
static void
mana_intr_handler(void *arg)
{
struct mana_priv *priv = arg;
struct ibv_context *ctx = priv->ib_ctx;
- struct ibv_async_event event;
+ struct ibv_async_event event = { 0 };
+ struct rte_eth_dev *dev;
/* Read and ack all messages from IB device */
while (true) {
if (ibv_get_async_event(ctx, &event))
break;
- if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
- struct rte_eth_dev *dev;
-
- dev = &rte_eth_devices[priv->port_id];
- if (dev->data->dev_conf.intr_conf.rmv)
+ switch (event.event_type) {
+ case IBV_EVENT_DEVICE_FATAL:
+ DRV_LOG(INFO, "IBV_EVENT_DEVICE_FATAL received, dev_state=%d",
+ (int)rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire));
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) == MANA_DEV_ACTIVE) {
+ /* Notify upper layers (e.g. netvsc) before
+ * acquiring the lock so they can switch data
+ * path before mana stops queues. Emitting
+ * outside the lock avoids deadlock if the
+ * callback calls dev_stop/dev_close.
+ */
+ dev = &rte_eth_devices[priv->port_id];
+ DRV_LOG(INFO,
+ "Sending RTE_ETH_EVENT_ERR_RECOVERING for port %u",
+ priv->port_id);
rte_eth_dev_callback_process(dev,
- RTE_ETH_EVENT_INTR_RMV, NULL);
+ RTE_ETH_EVENT_ERR_RECOVERING,
+ NULL);
+
+ pthread_mutex_lock(&priv->reset_ops_lock);
+
+ /* Re-check after lock to avoid racing with
+ * mana_pci_remove_event_cb which may have
+ * set RESET_FAILED while we waited.
+ */
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) !=
+ MANA_DEV_ACTIVE) {
+ pthread_mutex_unlock(
+ &priv->reset_ops_lock);
+ break;
+ }
+
+ mana_reset_enter(priv);
+
+ if (rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire) ==
+ MANA_DEV_RESET_FAILED) {
+ DRV_LOG(INFO,
+ "Sending RTE_ETH_EVENT_RECOVERY_FAILED for port %u",
+ priv->port_id);
+ rte_eth_dev_callback_process(dev,
+ RTE_ETH_EVENT_RECOVERY_FAILED,
+ NULL);
+ }
+ } else {
+ DRV_LOG(ERR, "Already in reset handling, dev_state=%d",
+ (int)rte_atomic_load_explicit(&priv->dev_state,
+ rte_memory_order_acquire));
+ }
+ break;
+
+ default:
+ break;
}
ibv_ack_async_event(&event);
@@ -1063,6 +1890,23 @@ static int
mana_intr_uninstall(struct mana_priv *priv)
{
int ret;
+ struct rte_eth_dev *dev;
+
+ if (!priv->intr_handle)
+ return 0;
+
+ /* Unregister PCI device removal event callback.
+ * Do not retry on -EAGAIN to avoid deadlock: the callback
+ * may be blocked waiting for reset_ops_lock which we hold.
+ */
+ dev = &rte_eth_devices[priv->port_id];
+ if (dev->device) {
+ ret = rte_dev_event_callback_unregister(dev->device->name,
+ mana_pci_remove_event_cb, priv);
+ if (ret < 0 && ret != -ENOENT)
+ DRV_LOG(WARNING, "Failed to unregister PCI remove cb ret %d",
+ ret);
+ }
ret = rte_intr_callback_unregister(priv->intr_handle,
mana_intr_handler, priv);
@@ -1072,6 +1916,7 @@ mana_intr_uninstall(struct mana_priv *priv)
}
rte_intr_instance_free(priv->intr_handle);
+ priv->intr_handle = NULL;
return 0;
}
@@ -1127,6 +1972,16 @@ mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
goto free_intr;
}
+ /* Register for PCI device removal events to distinguish
+ * PCI hot-remove from service reset. This requires the
+ * application to call rte_dev_event_monitor_start() for
+ * events to be delivered (e.g. testpmd --hot-plug-handling).
+ */
+ ret = rte_dev_event_callback_register(eth_dev->device->name,
+ mana_pci_remove_event_cb, priv);
+ if (ret)
+ DRV_LOG(WARNING, "Failed to register PCI remove event callback");
+
eth_dev->intr_handle = priv->intr_handle;
return 0;
@@ -1156,7 +2011,7 @@ mana_proc_priv_init(struct rte_eth_dev *dev)
/*
* Map the doorbell page for the secondary process through IB device handle.
*/
-static int
+int
mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
{
struct mana_process_priv *priv = eth_dev->process_private;
@@ -1294,17 +2149,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
char name[RTE_ETH_NAME_MAX_LEN];
int ret;
struct ibv_context *ctx = NULL;
+ bool is_reset = false;
+ pthread_mutexattr_t mattr;
+ pthread_condattr_t cattr;
rte_ether_format_addr(address, sizeof(address), addr);
- DRV_LOG(INFO, "device located port %u address %s", port, address);
- priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
- SOCKET_ID_ANY);
- if (!priv)
- return -ENOMEM;
+ DRV_LOG(DEBUG, "device located port %u address %s", port, address);
snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
+ eth_dev = rte_eth_dev_allocated(name);
+ if (eth_dev) {
+ is_reset = true;
+ priv = eth_dev->data->dev_private;
+ DRV_LOG(DEBUG, "Device reset for eth_dev %p priv %p",
+ eth_dev, priv);
+ } else {
+ priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
+ SOCKET_ID_ANY);
+ if (!priv)
+ return -ENOMEM;
+ }
+
if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
int fd;
@@ -1317,6 +2184,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
eth_dev->device = &pci_dev->device;
eth_dev->dev_ops = &mana_dev_secondary_ops;
+
ret = mana_proc_priv_init(eth_dev);
if (ret)
goto failed;
@@ -1336,7 +2204,7 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
goto failed;
}
- /* fd is no not used after mapping doorbell */
+ /* fd is not used after mapping doorbell */
close(fd);
eth_dev->tx_pkt_burst = mana_tx_burst;
@@ -1355,22 +2223,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
goto failed;
}
- eth_dev = rte_eth_dev_allocate(name);
- if (!eth_dev) {
- ret = -ENOMEM;
- goto failed;
- }
-
- eth_dev->data->mac_addrs =
- rte_calloc("mana_mac", 1,
- sizeof(struct rte_ether_addr), 0);
- if (!eth_dev->data->mac_addrs) {
- ret = -ENOMEM;
- goto failed;
- }
-
- rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
-
priv->ib_pd = ibv_alloc_pd(ctx);
if (!priv->ib_pd) {
DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
@@ -1390,10 +2242,6 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
}
priv->ib_ctx = ctx;
- priv->port_id = eth_dev->data->port_id;
- priv->dev_port = port;
- eth_dev->data->dev_private = priv;
- priv->dev_data = eth_dev->data;
priv->max_rx_queues = dev_attr->orig_attr.max_qp;
priv->max_tx_queues = dev_attr->orig_attr.max_qp;
@@ -1415,23 +2263,72 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
name, priv->max_rx_queues, priv->max_rx_desc,
priv->max_send_sge, priv->max_mr_size);
- rte_eth_copy_pci_info(eth_dev, pci_dev);
+ if (!is_reset) {
+ eth_dev = rte_eth_dev_allocate(name);
+ if (!eth_dev) {
+ ret = -ENOMEM;
+ goto failed;
+ }
- /* Create async interrupt handler */
- ret = mana_intr_install(eth_dev, priv);
- if (ret) {
- DRV_LOG(ERR, "Failed to install intr handler");
- goto failed;
+ eth_dev->data->mac_addrs =
+ rte_calloc("mana_mac", 1,
+ sizeof(struct rte_ether_addr), 0);
+ if (!eth_dev->data->mac_addrs) {
+ ret = -ENOMEM;
+ goto failed;
+ }
+
+ rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
+ } else {
+ /*
+ * Reset path.
+ */
+ rte_ether_format_addr(address, RTE_ETHER_ADDR_FMT_SIZE,
+ eth_dev->data->mac_addrs);
+ DRV_LOG(DEBUG, "Found existing eth_dev %p with mac addr %s",
+ eth_dev, address);
+ DRV_LOG(DEBUG, "ib_ctx = %p", priv->ib_ctx);
+ goto out;
}
- eth_dev->device = &pci_dev->device;
+ priv->port_id = eth_dev->data->port_id;
+ priv->dev_port = port;
+ eth_dev->data->dev_private = priv;
+ priv->dev_data = eth_dev->data;
+ rte_atomic_store_explicit(&priv->dev_state, MANA_DEV_ACTIVE,
+ rte_memory_order_release);
+
+ rte_eth_copy_pci_info(eth_dev, pci_dev);
+
+ pthread_mutexattr_init(&mattr);
+ pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
+ pthread_mutex_init(&priv->reset_ops_lock, &mattr);
+ pthread_mutex_init(&priv->reset_cond_mutex, &mattr);
+ pthread_mutexattr_destroy(&mattr);
+
+ pthread_condattr_init(&cattr);
+ pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+ pthread_cond_init(&priv->reset_cond, &cattr);
+ pthread_condattr_destroy(&cattr);
- DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
+ eth_dev->device = &pci_dev->device;
eth_dev->rx_pkt_burst = mana_rx_burst_removed;
eth_dev->tx_pkt_burst = mana_tx_burst_removed;
eth_dev->dev_ops = &mana_dev_ops;
+out:
+ /* Create async interrupt handler */
+ ret = mana_intr_install(eth_dev, priv);
+ if (ret) {
+ DRV_LOG(ERR, "Failed to install intr handler, ret %d", ret);
+ goto failed;
+ }
+ DRV_LOG(INFO, "mana_intr_install succeeded");
+
+ DRV_LOG(INFO, "device %s priv %p dev port %d at port %u",
+ name, priv, priv->dev_port, eth_dev->data->port_id);
+
rte_eth_dev_probing_finish(eth_dev);
return 0;
@@ -1439,20 +2336,29 @@ mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
failed:
/* Free the resource for the port failed */
if (priv) {
- if (priv->ib_parent_pd)
+ if (priv->ib_parent_pd) {
ibv_dealloc_pd(priv->ib_parent_pd);
+ priv->ib_parent_pd = NULL;
+ }
- if (priv->ib_pd)
+ if (priv->ib_pd) {
ibv_dealloc_pd(priv->ib_pd);
+ priv->ib_pd = NULL;
+ }
}
- if (eth_dev)
- rte_eth_dev_release_port(eth_dev);
+ if (!is_reset) {
+ if (eth_dev)
+ rte_eth_dev_release_port(eth_dev);
- rte_free(priv);
+ rte_free(priv);
+ }
- if (ctx)
+ if (ctx) {
ibv_close_device(ctx);
+ if (is_reset && priv)
+ priv->ib_ctx = NULL;
+ }
return ret;
}
@@ -1617,7 +2523,17 @@ mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
static int
mana_dev_uninit(struct rte_eth_dev *dev)
{
- return mana_dev_close(dev);
+ struct mana_priv *priv = dev->data->dev_private;
+ int ret;
+
+ /* Join reset thread before teardown to ensure it has exited
+ * before we destroy the condvar/mutex in free_resources.
+ */
+ mana_join_reset_thread(priv);
+
+ ret = mana_dev_close(dev);
+ mana_dev_free_resources(dev);
+ return ret;
}
/*
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 79cc47b6ab..a7b301484a 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -5,6 +5,8 @@
#ifndef __MANA_H__
#define __MANA_H__
+#include <pthread.h>
+
#define PCI_VENDOR_ID_MICROSOFT 0x1414
#define PCI_DEVICE_ID_MICROSOFT_MANA_PF 0x00b9
#define PCI_DEVICE_ID_MICROSOFT_MANA 0x00ba
@@ -337,6 +339,26 @@ struct mana_process_priv {
void *db_page;
};
+enum mana_device_state {
+ /* Normal running; set at probe and after a successful reset exit */
+ MANA_DEV_ACTIVE = 0,
+ /* In reset enter processing (set by mana_reset_enter) */
+ MANA_DEV_RESET_ENTER = 1,
+ /*
+ * Reset enter processing completed (set by mana_reset_thread).
+ * Waiting for reset exit or in reset exit processing.
+ */
+ MANA_DEV_RESET_EXIT = 2,
+ /* Reset failed, or PCI removal seen by mana_pci_remove_event_cb */
+ MANA_DEV_RESET_FAILED = 3,
+};
+
+/* burst_state bit layout:
+ * Bit 0: in-burst (set by data path CAS 0→1, cleared on exit).
+ * Bit 1: blocked (set by reset path to reject new bursts).
+ */
+#define MANA_BURST_BLOCKED 2
+
struct mana_priv {
struct rte_eth_dev_data *dev_data;
struct mana_process_priv *process_priv;
@@ -368,6 +390,15 @@ struct mana_priv {
uint64_t max_mr_size;
struct mana_mr_btree mr_btree;
rte_spinlock_t mr_btree_lock;
+ RTE_ATOMIC(enum mana_device_state) dev_state;
+ /* mutex for synchronizing mana reset and some mana_dev_ops callbacks */
+ pthread_mutex_t reset_ops_lock;
+ /* Reset thread ID, valid when reset_thread_active is true */
+ rte_thread_t reset_thread;
+ RTE_ATOMIC(bool) reset_thread_active;
+ /* Condvar to wake reset thread early on PCI remove */
+ pthread_mutex_t reset_cond_mutex;
+ pthread_cond_t reset_cond;
};
struct mana_txq_desc {
@@ -427,6 +458,14 @@ struct mana_txq {
struct mana_mr_btree mr_btree;
struct mana_stats stats;
unsigned int socket;
+ unsigned int txq_idx;
+
+ /*
+ * Bit 0: in-burst flag (set by data path, cleared on exit).
+ * Bit 1: blocked flag (set by reset path via fetch_or).
+ * Data path CAS 0→1 to enter; fails if blocked bit is set.
+ */
+ RTE_ATOMIC(uint32_t) burst_state;
};
struct mana_rxq {
@@ -462,6 +501,14 @@ struct mana_rxq {
struct mana_mr_btree mr_btree;
unsigned int socket;
+ unsigned int rxq_idx;
+
+ /*
+ * Bit 0: in-burst flag (set by data path, cleared on exit).
+ * Bit 1: blocked flag (set by reset path via fetch_or).
+ * Data path CAS 0→1 to enter; fails if blocked bit is set.
+ */
+ RTE_ATOMIC(uint32_t) burst_state;
};
extern int mana_logtype_driver;
@@ -543,6 +590,8 @@ enum mana_mp_req_type {
MANA_MP_REQ_CREATE_MR,
MANA_MP_REQ_START_RXTX,
MANA_MP_REQ_STOP_RXTX,
+ MANA_MP_REQ_RESET_ENTER,
+ MANA_MP_REQ_RESET_EXIT,
};
/* Pameters for IPC. */
@@ -563,8 +612,9 @@ void mana_mp_uninit_primary(void);
void mana_mp_uninit_secondary(void);
int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
int mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len);
+int mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd);
-void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
+int mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
void *mana_alloc_verbs_buf(size_t size, void *data);
void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 72417fc0c7..1670f1ea9c 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -2,10 +2,13 @@
* Copyright 2022 Microsoft Corporation
*/
+#include <sys/mman.h>
#include <rte_malloc.h>
#include <ethdev_driver.h>
#include <rte_log.h>
+#include <rte_eal_paging.h>
#include <stdlib.h>
+#include <unistd.h>
#include <infiniband/verbs.h>
@@ -119,6 +122,23 @@ mana_mp_primary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
return ret;
}
+static int
+mana_mp_reset_enter(struct rte_eth_dev *dev)
+{
+ struct mana_process_priv *pp = dev->process_private;
+ void *stale_db = pp->db_page;
+
+ /* Clear the stored doorbell pointer first, then drop the mapping.
+ * Unmapping is best effort: the munmap() result is ignored.
+ */
+ pp->db_page = NULL;
+ if (stale_db != NULL)
+ (void)munmap(stale_db, rte_mem_page_size());
+
+ DRV_LOG(DEBUG, "Secondary doorbell pages unmapped");
+ return 0;
+}
+
static int
mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
{
@@ -171,6 +191,52 @@ mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg, const void *peer)
ret = rte_mp_reply(&mp_res, peer);
break;
+ case MANA_MP_REQ_RESET_ENTER:
+ DRV_LOG(INFO, "Port %u reset enter", dev->data->port_id);
+ res->result = mana_mp_reset_enter(dev);
+
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+
+ case MANA_MP_REQ_RESET_EXIT:
+ DRV_LOG(INFO, "Port %u reset exit", dev->data->port_id);
+ {
+ struct mana_process_priv *proc_priv =
+ dev->process_private;
+
+ if (proc_priv->db_page != NULL) {
+ DRV_LOG(DEBUG,
+ "Secondary doorbell already "
+ "mapped to %p",
+ proc_priv->db_page);
+ res->result = 0;
+ } else if (mp_msg->num_fds < 1) {
+ DRV_LOG(ERR,
+ "No FD in RESET_EXIT message");
+ res->result = -EINVAL;
+ } else {
+ ret = mana_map_doorbell_secondary(dev,
+ mp_msg->fds[0]);
+ if (ret) {
+ DRV_LOG(ERR,
+ "Failed secondary "
+ "doorbell map %d",
+ mp_msg->fds[0]);
+ res->result = -ENODEV;
+ } else {
+ res->result = 0;
+ }
+ }
+
+ /* Close the fd whenever present, even if
+ * db_page was already mapped.
+ */
+ if (mp_msg->num_fds >= 1)
+ close(mp_msg->fds[0]);
+ }
+ ret = rte_mp_reply(&mp_res, peer);
+ break;
+
default:
DRV_LOG(ERR, "Port %u unknown secondary MP type %u",
param->port_id, param->type);
@@ -254,7 +320,7 @@ mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev)
}
ret = mp_res->fds[0];
- DRV_LOG(ERR, "port %u command FD from primary is %d",
+ DRV_LOG(DEBUG, "port %u command FD from primary is %d",
dev->data->port_id, ret);
exit:
free(mp_rep.msgs);
@@ -298,27 +364,36 @@ mana_mp_req_mr_create(struct mana_priv *priv, uintptr_t addr, uint32_t len)
return ret;
}
-void
+int
mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
{
struct rte_mp_msg mp_req = { 0 };
struct rte_mp_msg *mp_res;
- struct rte_mp_reply mp_rep;
+ struct rte_mp_reply mp_rep = { 0 };
struct mana_mp_param *res;
struct timespec ts = {.tv_sec = MANA_MP_REQ_TIMEOUT_SEC, .tv_nsec = 0};
- int i, ret;
+ int i, ret = 0;
- if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX) {
+ if (type != MANA_MP_REQ_START_RXTX && type != MANA_MP_REQ_STOP_RXTX &&
+ type != MANA_MP_REQ_RESET_ENTER && type != MANA_MP_REQ_RESET_EXIT) {
DRV_LOG(ERR, "port %u unknown request (req_type %d)",
dev->data->port_id, type);
- return;
+ return -EINVAL;
}
if (rte_atomic_load_explicit(&mana_shared_data->secondary_cnt, rte_memory_order_relaxed) == 0)
- return;
+ return 0;
mp_init_msg(&mp_req, type, dev->data->port_id);
+ /* Include IB cmd FD for secondary doorbell remap */
+ if (type == MANA_MP_REQ_RESET_EXIT) {
+ struct mana_priv *priv = dev->data->dev_private;
+
+ mp_req.num_fds = 1;
+ mp_req.fds[0] = priv->ib_ctx->cmd_fd;
+ }
+
ret = rte_mp_request_sync(&mp_req, &mp_rep, &ts);
if (ret) {
if (rte_errno != ENOTSUP)
@@ -329,6 +404,7 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
if (mp_rep.nb_sent != mp_rep.nb_received) {
DRV_LOG(ERR, "port %u not all secondaries responded (%d)",
dev->data->port_id, type);
+ ret = -ETIMEDOUT;
goto exit;
}
for (i = 0; i < mp_rep.nb_received; i++) {
@@ -337,9 +413,11 @@ mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type)
if (res->result) {
DRV_LOG(ERR, "port %u request failed on secondary %d",
dev->data->port_id, i);
+ ret = res->result;
goto exit;
}
}
exit:
free(mp_rep.msgs);
+ return ret;
}
diff --git a/drivers/net/mana/mr.c b/drivers/net/mana/mr.c
index c4045141bc..8914f4cf04 100644
--- a/drivers/net/mana/mr.c
+++ b/drivers/net/mana/mr.c
@@ -314,8 +314,10 @@ mana_mr_btree_init(struct mana_mr_btree *bt, int n, int socket)
void
mana_mr_btree_free(struct mana_mr_btree *bt)
{
- rte_free(bt->table);
- memset(bt, 0, sizeof(*bt));
+ if (bt && bt->table) {
+ rte_free(bt->table);
+ memset(bt, 0, sizeof(*bt));
+ }
}
int
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index 1b8ba1f3a9..aedb05d46f 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -36,6 +36,11 @@ mana_rq_ring_doorbell(struct mana_rxq *rxq)
db_page = process_priv->db_page;
}
+ if (!db_page) {
+ DP_LOG(ERR, "db_page is NULL, cannot ring RX doorbell");
+ return -EINVAL;
+ }
+
/* Hardware Spec specifies that software client should set 0 for
* wqe_cnt for Receive Queues.
*/
@@ -172,7 +177,7 @@ mana_stop_rx_queues(struct rte_eth_dev *dev)
for (i = 0; i < priv->num_queues; i++)
if (dev->data->rx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
- return -EINVAL;
+ return 0;
if (priv->rwq_qp) {
ret = ibv_destroy_qp(priv->rwq_qp);
@@ -256,6 +261,9 @@ mana_start_rx_queues(struct rte_eth_dev *dev)
struct mana_rxq *rxq = dev->data->rx_queues[i];
struct ibv_wq_init_attr wq_attr = {};
+ rxq->rxq_idx = i;
+ DRV_LOG(DEBUG, "assigning rxq_idx to %d", i);
+
manadv_set_context_attr(priv->ib_ctx,
MANADV_CTX_ATTR_BUF_ALLOCATORS,
(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -451,6 +459,16 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
uint32_t pkt_len;
uint32_t i;
int polled = 0;
+ uint32_t expected = 0;
+
+ /* Single atomic CAS: enter burst only if device is active (0→1).
+ * Fails immediately if reset path has set the blocked bit.
+ */
+ if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+ &rxq->burst_state, &expected, 1,
+ rte_memory_order_acquire,
+ rte_memory_order_relaxed)))
+ return 0;
repoll:
/* Polling on new completions if we have no backlog */
@@ -592,6 +610,9 @@ mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
wqe_consumed, ret);
}
+ rte_atomic_fetch_and_explicit(&rxq->burst_state, ~(uint32_t)1,
+ rte_memory_order_release);
+
return pkt_received;
}
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
index 57dbbc3651..10f2212b5d 100644
--- a/drivers/net/mana/tx.c
+++ b/drivers/net/mana/tx.c
@@ -17,7 +17,7 @@ mana_stop_tx_queues(struct rte_eth_dev *dev)
for (i = 0; i < priv->num_queues; i++)
if (dev->data->tx_queue_state[i] == RTE_ETH_QUEUE_STATE_STOPPED)
- return -EINVAL;
+ return 0;
for (i = 0; i < priv->num_queues; i++) {
struct mana_txq *txq = dev->data->tx_queues[i];
@@ -83,6 +83,9 @@ mana_start_tx_queues(struct rte_eth_dev *dev)
txq = dev->data->tx_queues[i];
+ txq->txq_idx = i;
+ DRV_LOG(DEBUG, "assigning txq_idx to %d", txq->txq_idx);
+
manadv_set_context_attr(priv->ib_ctx,
MANADV_CTX_ATTR_BUF_ALLOCATORS,
(void *)((uintptr_t)&(struct manadv_ctx_allocators){
@@ -190,10 +193,34 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
void *db_page;
uint16_t pkt_sent = 0;
uint32_t num_comp, i;
+ uint32_t expected = 0;
#ifdef RTE_ARCH_32
uint32_t wqe_count = 0;
#endif
+ db_page = priv->db_page;
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ struct rte_eth_dev *dev =
+ &rte_eth_devices[priv->dev_data->port_id];
+ struct mana_process_priv *process_priv = dev->process_private;
+
+ db_page = process_priv->db_page;
+ }
+
+ /* Single atomic CAS: enter burst only if device is active (0→1).
+ * Fails immediately if reset path has set the blocked bit.
+ */
+ if (unlikely(!rte_atomic_compare_exchange_strong_explicit(
+ &txq->burst_state, &expected, 1,
+ rte_memory_order_acquire,
+ rte_memory_order_relaxed) || !db_page)) {
+ if (!expected) /* CAS succeeded but db_page NULL — undo */
+ rte_atomic_fetch_and_explicit(&txq->burst_state,
+ ~(uint32_t)1,
+ rte_memory_order_release);
+ return 0;
+ }
+
/* Process send completions from GDMA */
num_comp = gdma_poll_completion_queue(&txq->gdma_cq,
txq->gdma_comp_buf, txq->num_desc);
@@ -216,7 +243,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
}
if (!desc->pkt) {
- DP_LOG(ERR, "mana_txq_desc has a NULL pkt");
+ DP_LOG(ERR, "mana_txq_desc has a NULL pkt, priv %p, "
+ "txq = %d", priv, txq->txq_idx);
} else {
txq->stats.bytes += desc->pkt->pkt_len;
rte_pktmbuf_free(desc->pkt);
@@ -474,15 +502,6 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
}
/* Ring hardware door bell */
- db_page = priv->db_page;
- if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
- struct rte_eth_dev *dev =
- &rte_eth_devices[priv->dev_data->port_id];
- struct mana_process_priv *process_priv = dev->process_private;
-
- db_page = process_priv->db_page;
- }
-
if (pkt_sent) {
#ifdef RTE_ARCH_32
ret = mana_ring_short_doorbell(db_page, GDMA_QUEUE_SEND,
@@ -501,5 +520,8 @@ mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
DP_LOG(ERR, "mana_ring_doorbell failed ret %d", ret);
}
+ rte_atomic_fetch_and_explicit(&txq->burst_state, ~(uint32_t)1,
+ rte_memory_order_release);
+
return pkt_sent;
}
--
2.34.1
^ permalink raw reply related
* Re: [RFC 1/4] telemetry: allow commands to receive file descriptors
From: Bruce Richardson @ 2026-06-16 12:32 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
In-Reply-To: <20260609210540.768074-2-stephen@networkplumber.org>
On Tue, Jun 09, 2026 at 02:02:02PM -0700, Stephen Hemminger wrote:
> Add rte_telemetry_register_cmd_fd_arg() to register a command whose
> callback also receives file descriptors passed by the client as
> SCM_RIGHTS ancillary data. The callback owns the descriptors and must
> close them.
>
> This lets a client open a file itself and hand the descriptor to the
> primary process, so DPDK never opens the path. That avoids path and
> permission problems and works across container filesystem namespaces.
>
> Existing commands and clients are unaffected. If unsolicited file
> descriptor is passed, it is closed.
>
This scheme seems reasonable in general. My only concern is whether the
lack of potential Windows support is an issue. For regular telemetry, there
was always the option of a Windows implementation using regular
TCP/UDP/SCTP sockets bound to localhost. However, AFAIK there is no Windows
implementation of anything that supports passing file descriptors or handles
between processes.
Some other pieces of feedback inline below.
/Bruce
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
> doc/guides/rel_notes/release_26_07.rst | 5 ++
> lib/telemetry/rte_telemetry.h | 66 ++++++++++++++
> lib/telemetry/telemetry.c | 115 ++++++++++++++++++++++---
> 3 files changed, 174 insertions(+), 12 deletions(-)
>
> diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
> index b5285af5fe..d7a2df88c1 100644
> --- a/doc/guides/rel_notes/release_26_07.rst
> +++ b/doc/guides/rel_notes/release_26_07.rst
> @@ -141,6 +141,11 @@ New Features
> Added AGENTS.md file for AI review
> and supporting scripts to review patches and documentation.
>
> +* **Added telemetry support for passing file descriptors.**
> +
> + Add experimental telemetry callback ``rte_telemetry_register_cmd_fd_arg()``
> + to allow command to receive file descriptors passed by client.
> +
>
> Removed Items
> -------------
> diff --git a/lib/telemetry/rte_telemetry.h b/lib/telemetry/rte_telemetry.h
> index 0a58e518f7..3e32d2902b 100644
> --- a/lib/telemetry/rte_telemetry.h
> +++ b/lib/telemetry/rte_telemetry.h
> @@ -325,6 +325,37 @@ typedef int (*telemetry_cb)(const char *cmd, const char *params,
> typedef int (*telemetry_arg_cb)(const char *cmd, const char *params, void *arg,
> struct rte_tel_data *info);
>
> +/**
> + * This telemetry callback is used when registering a telemetry command with
> + * rte_telemetry_register_cmd_fd_arg().
> + *
> + * It behaves like telemetry_arg_cb, but additionally receives any file
> + * descriptors the client passed alongside the command as SCM_RIGHTS ancillary
> + * data. The callback takes ownership of these descriptors and is responsible
> + * for closing them.
> + *
> + * @param cmd
> + * The cmd that was requested by the client.
> + * @param params
> + * Contains data required by the callback function.
> + * @param arg
> + * The opaque value that was passed to rte_telemetry_register_cmd_fd_arg().
> + * @param fds
> + * Array of file descriptors received from the client. May be NULL when
> + * n_fds is zero.
> + * @param n_fds
> + * Number of file descriptors in the fds array.
> + * @param info
> + * The information to be returned to the caller.
> + *
> + * @return
> + * Length of buffer used on success.
> + * @return
> + * Negative integer on error.
> + */
> +typedef int (*telemetry_fd_cb)(const char *cmd, const char *params, void *arg,
> + const int *fds, unsigned int n_fds, struct rte_tel_data *info);
> +
Do we anticipate in future having callbacks taking more than one FD? Would
it not be simpler just to have a single fd parameter (which is -1 when no fd
is passed)?
> /**
> * Used when registering a command and callback function with telemetry.
> *
> @@ -368,6 +399,41 @@ __rte_experimental
> int
> rte_telemetry_register_cmd_arg(const char *cmd, telemetry_arg_cb fn, void *arg, const char *help);
>
> +/**
> + * Register a command and a file-descriptor-aware callback with telemetry.
> + *
> + * The callback is invoked like rte_telemetry_register_cmd_arg(), but also
> + * receives any file descriptors the client passed alongside the command as
> + * SCM_RIGHTS ancillary data. This lets a client open a file (for example a
> + * capture output file) itself and hand the descriptor to the DPDK process,
> + * which never opens the path - avoiding path and permission concerns and
> + * working across container filesystem namespaces.
> + *
> + * Descriptors sent to a command registered with rte_telemetry_register_cmd()
> + * or rte_telemetry_register_cmd_arg() are rejected and the connection is
> + * closed.
> + *
> + * @param cmd
> + * The command to register with telemetry.
> + * @param fn
> + * Callback function to be called when the command is requested.
> + * @param arg
> + * An opaque value that will be passed to the callback function.
> + * @param help
> + * Help text for the command.
> + *
> + * @return
> + * 0 on success.
> + * @return
> + * -EINVAL for invalid parameters failure.
> + * @return
> + * -ENOMEM for mem allocation failure.
> + */
> +__rte_experimental
> +int
> +rte_telemetry_register_cmd_fd_arg(const char *cmd, telemetry_fd_cb fn, void *arg,
> + const char *help);
> +
Do we want to make this experimental for later stabilization, or is this an
API that is best kept as internal-only? I'd tend towards keeping it
internal-only, rather than allowing apps to define callbacks which take
FDs.
> /**
> * @internal
> * Free a container that has memory allocated.
> diff --git a/lib/telemetry/telemetry.c b/lib/telemetry/telemetry.c
> index b109d076d4..30d3ae3a13 100644
> --- a/lib/telemetry/telemetry.c
> +++ b/lib/telemetry/telemetry.c
> @@ -29,6 +29,8 @@
> #define MAX_CMD_LEN 56
> #define MAX_OUTPUT_LEN (1024 * 16)
> #define MAX_CONNECTIONS 10
> +/* Maximum number of file descriptors a client may pass with one command. */
> +#define MAX_FDS 8
>
As above, do we really need multiple FDs?
> #ifndef RTE_EXEC_ENV_WINDOWS
> static void *
> @@ -39,6 +41,7 @@ struct cmd_callback {
> char cmd[MAX_CMD_LEN];
> telemetry_cb fn;
> telemetry_arg_cb fn_arg;
> + telemetry_fd_cb fn_fd;
> void *arg;
> char help[RTE_TEL_MAX_STRING_LEN];
> };
> @@ -72,15 +75,15 @@ static RTE_ATOMIC(uint16_t) v2_clients;
> #endif /* !RTE_EXEC_ENV_WINDOWS */
>
> static int
> -register_cmd(const char *cmd, const char *help,
> - telemetry_cb fn, telemetry_arg_cb fn_arg, void *arg)
> +register_cmd(const char *cmd, const char *help, telemetry_cb fn,
> + telemetry_arg_cb fn_arg, telemetry_fd_cb fn_fd, void *arg)
> {
> struct cmd_callback *new_callbacks;
> const char *cmdp = cmd;
> int i = 0;
>
> - if (strlen(cmd) >= MAX_CMD_LEN || (fn == NULL && fn_arg == NULL) || cmd[0] != '/'
> - || strlen(help) >= RTE_TEL_MAX_STRING_LEN)
> + if (strlen(cmd) >= MAX_CMD_LEN || (fn == NULL && fn_arg == NULL && fn_fd == NULL)
> + || cmd[0] != '/' || strlen(help) >= RTE_TEL_MAX_STRING_LEN)
> return -EINVAL;
>
> while (*cmdp != '\0') {
> @@ -107,6 +110,7 @@ register_cmd(const char *cmd, const char *help,
> strlcpy(callbacks[i].cmd, cmd, MAX_CMD_LEN);
> callbacks[i].fn = fn;
> callbacks[i].fn_arg = fn_arg;
> + callbacks[i].fn_fd = fn_fd;
> callbacks[i].arg = arg;
> strlcpy(callbacks[i].help, help, RTE_TEL_MAX_STRING_LEN);
> num_callbacks++;
> @@ -119,14 +123,22 @@ RTE_EXPORT_SYMBOL(rte_telemetry_register_cmd)
> int
> rte_telemetry_register_cmd(const char *cmd, telemetry_cb fn, const char *help)
> {
> - return register_cmd(cmd, help, fn, NULL, NULL);
> + return register_cmd(cmd, help, fn, NULL, NULL, NULL);
> }
>
> RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_telemetry_register_cmd_arg, 24.11)
> int
> rte_telemetry_register_cmd_arg(const char *cmd, telemetry_arg_cb fn, void *arg, const char *help)
> {
> - return register_cmd(cmd, help, NULL, fn, arg);
> + return register_cmd(cmd, help, NULL, fn, NULL, arg);
> +}
> +
> +RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_telemetry_register_cmd_fd_arg, 26.07)
> +int
> +rte_telemetry_register_cmd_fd_arg(const char *cmd, telemetry_fd_cb fn, void *arg,
> + const char *help)
> +{
> + return register_cmd(cmd, help, NULL, NULL, fn, arg);
> }
>
> #ifndef RTE_EXEC_ENV_WINDOWS
> @@ -368,13 +380,70 @@ output_json(const char *cmd, const struct rte_tel_data *d, int s)
> TMTY_LOG_LINE(ERR, "Error writing to socket: %s", strerror(errno));
> }
>
> +/*
> + * Receive a command and any file descriptors the client passed alongside it
> + * as SCM_RIGHTS ancillary data. The payload length is returned (0 if the
> + * client sent an empty message or closed the connection, negative on error).
> + * Descriptors that arrive are returned in fds[]/n_fds and are owned by the
> + * caller. MSG_CTRUNC means more descriptors were sent than the control buffer
> + * could hold; *ctrunc is set so the caller can reject the command, but the
> + * descriptors that did fit are still returned so they can be closed rather
> + * than leaked.
> + */
> +static int
> +recv_with_fds(int s, char *buf, size_t buf_len, int *fds, unsigned int *n_fds,
> + bool *ctrunc)
> +{
> + char cmsgbuf[CMSG_SPACE(sizeof(int) * MAX_FDS)];
> + struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
> + struct msghdr msg = {
> + .msg_iov = &iov,
> + .msg_iovlen = 1,
> + .msg_control = cmsgbuf,
> + .msg_controllen = sizeof(cmsgbuf),
> + };
> + struct cmsghdr *cmsg;
> + int bytes;
> +
> + *n_fds = 0;
> + *ctrunc = false;
> +
> + bytes = recvmsg(s, &msg, 0);
> + if (bytes < 0)
> + return bytes;
> +
> + if (msg.msg_flags & MSG_CTRUNC)
> + *ctrunc = true;
> +
> + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
> + if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS)
> + continue;
> + *n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
> + memcpy(fds, CMSG_DATA(cmsg), *n_fds * sizeof(int));
> + break;
> + }
> + return bytes;
> +}
> +
> static void
> -perform_command(const struct cmd_callback *cb, const char *cmd, const char *param, int s)
> +close_fds(const int *fds, unsigned int n_fds)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < n_fds; i++)
> + close(fds[i]);
> +}
> +
> +static void
> +perform_command(const struct cmd_callback *cb, const char *cmd, const char *param,
> + const int *fds, unsigned int n_fds, int s)
> {
> struct rte_tel_data data = {0};
> int ret;
>
> - if (cb->fn_arg != NULL)
> + if (cb->fn_fd != NULL)
> + ret = cb->fn_fd(cmd, param, cb->arg, fds, n_fds, &data);
> + else if (cb->fn_arg != NULL)
> ret = cb->fn_arg(cmd, param, cb->arg, &data);
> else
> ret = cb->fn(cmd, param, &data);
> @@ -412,8 +481,11 @@ client_handler(void *sock_id)
> }
>
> /* receive data is not null terminated */
> - int bytes = read(s, buffer, sizeof(buffer) - 1);
> - while (bytes > 0) {
> + int fds[MAX_FDS];
> + unsigned int n_fds = 0;
> + bool ctrunc = false;
> + int bytes = recv_with_fds(s, buffer, sizeof(buffer) - 1, fds, &n_fds, &ctrunc);
> + while (bytes > 0 || (bytes == 0 && n_fds > 0)) {
> buffer[bytes] = 0;
> const char *cmd = strtok(buffer, ",");
> const char *param = strtok(NULL, "\0");
> @@ -429,9 +501,28 @@ client_handler(void *sock_id)
> }
> rte_spinlock_unlock(&callback_sl);
> }
> - perform_command(&cb, cmd, param, s);
>
> - bytes = read(s, buffer, sizeof(buffer) - 1);
> + /*
> + * File descriptors go only to a command that registered to
> + * receive them. A command that did not, or a truncated control
> + * message, is a client error: close the descriptors and drop the
> + * connection rather than silently discarding them.
> + */
> + if (n_fds > 0 && (cb.fn_fd == NULL || ctrunc)) {
> + TMTY_LOG_LINE(ERR,
> + "Closing connection: %u file descriptor(s) passed to '%s'%s",
> + n_fds, cmd ? cmd : "(none)",
> + ctrunc ? " (truncated)" : " which does not accept them");
> + close_fds(fds, n_fds);
> + break;
> + }
> +
> + /* an fd-aware callback takes ownership of the descriptors */
> + perform_command(&cb, cmd, param, fds, n_fds, s);
> +
> + n_fds = 0;
> + ctrunc = false;
the receive function always resets this to false anyway, so you may be able
to omit this line (assuming compiler doesn't complain).
> + bytes = recv_with_fds(s, buffer, sizeof(buffer) - 1, fds, &n_fds, &ctrunc);
> }
> exit:
> close(s);
> --
> 2.53.0
>
^ permalink raw reply
* [DPDK/other Bug 952] unit tests fail when machine has more than 128 cores
From: bugzilla @ 2026-06-16 12:34 UTC (permalink / raw)
To: dev
In-Reply-To: <bug-952-3@http.bugs.dpdk.org/>
http://bugs.dpdk.org/show_bug.cgi?id=952
Thomas Monjalon (thomas@monjalon.net) changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |FIXED
Status|UNCONFIRMED |RESOLVED
--- Comment #1 from Thomas Monjalon (thomas@monjalon.net) ---
Resolved in https://dpdk.org/id/c9eb695f16
test/atomic: scale test based on core count
--
You are receiving this mail because:
You are the assignee for the bug.
^ permalink raw reply
* Re: [RFC 0/4] alternative capture mechanism
From: Bruce Richardson @ 2026-06-16 12:37 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
In-Reply-To: <20260609210540.768074-1-stephen@networkplumber.org>
On Tue, Jun 09, 2026 at 02:02:01PM -0700, Stephen Hemminger wrote:
> This is an RFC for an alternative way to capture packets from a DPDK
> application. I did brief demo of similar mechanism at DPDK summit but
> this is more complete. Capture runs in the primary process and is driven
> entirely over telemetry; no secondary process is involved.
>
> A client asks the application to start capturing and passes it a file
> descriptor to write to. The application writes pcapng to that descriptor.
> A Wireshark extcap script is the intended front end, but the control path
> is just telemetry and the output is just a pipe, so other front ends are
> possible.
>
> 1/4 telemetry: let a command receive file descriptors from the client
> 2/4 capture: the library
> 3/4 test: functional test
> 4/4 app: the Wireshark extcap script and its documentation
>
> Setup and usage are in doc/guides/tools/wireshark_extcap.rst.
>
> Primary process only for now; secondary-process capture is possible as
> follow-on. Posting as RFC to get feedback on the approach.
>
> The extcap script is dual licensed (BSD-3-Clause OR GPL-2.0-or-later) as
> it may be more useful in the Wireshark tree.
>
One concern I have though - does this cause system calls to be made in the
fast path because we are writing to a passed-in FD? For performance
reasons, would it not be better to use a memory buffer for this, thereby
avoiding syscalls? For example, rather than passing in an FD to telemetry,
we could pass in a key to be passed to shmget (going old-school!), or a
name parameter for shm_open. Thereafter, with the memory buffer we can use a
circular ring or similar to pass the data from app to client.
/Bruce
> Stephen Hemminger (4):
> telemetry: allow commands to receive file descriptors
> capture: infrastructure wireshark packet capture
> test: add test for capture hooks
> usertools/dpdk-wireshark-extcap.py: script for external capture
>
> MAINTAINERS | 4 +
> app/test/meson.build | 1 +
> app/test/test_capture.c | 365 +++++++++++
> doc/guides/rel_notes/release_26_07.rst | 12 +
> doc/guides/tools/index.rst | 1 +
> doc/guides/tools/wireshark_extcap.rst | 155 +++++
> lib/capture/capture.c | 821 +++++++++++++++++++++++++
> lib/capture/capture_impl.h | 56 ++
> lib/capture/filter.c | 108 ++++
> lib/capture/meson.build | 19 +
> lib/meson.build | 1 +
> lib/telemetry/rte_telemetry.h | 66 ++
> lib/telemetry/telemetry.c | 115 +++-
> usertools/dpdk-wireshark-extcap.py | 274 +++++++++
> 14 files changed, 1986 insertions(+), 12 deletions(-)
> create mode 100644 app/test/test_capture.c
> create mode 100644 doc/guides/tools/wireshark_extcap.rst
> create mode 100644 lib/capture/capture.c
> create mode 100644 lib/capture/capture_impl.h
> create mode 100644 lib/capture/filter.c
> create mode 100644 lib/capture/meson.build
> create mode 100755 usertools/dpdk-wireshark-extcap.py
>
> --
> 2.53.0
>
^ permalink raw reply
* [v4] net/cksum: compute raw cksum for several segments
From: Su Sai @ 2026-06-16 12:38 UTC (permalink / raw)
To: stephen; +Cc: dev, spiderdetective.ss
In-Reply-To: <20260608100202.0deac83d@phoenix.local>
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset=y, Size: 7012 bytes --]
The rte_raw_cksum_mbuf function is used to compute
the raw checksum of a packet.
If the packet payload is stored in multiple mbufs, the function
takes the hard case. In the hard case,
the variable 'tmp' is of type uint32_t,
so rte_bswap16 drops its high 16 bits.
Meanwhile, the variable 'sum' is also of type uint32_t,
so 'sum += tmp' drops the carry on overflow.
Both losses make the checksum incorrect.
This commit fixes the above bugs.
Signed-off-by: Su Sai <spiderdetective.ss@gmail.com>
---
.mailmap | 1 +
app/test/test_cksum.c | 102 ++++++++++++++++++++++++++++++++++++++++++
lib/net/rte_cksum.h | 27 +++++++++--
3 files changed, 126 insertions(+), 4 deletions(-)
diff --git a/.mailmap b/.mailmap
index 4001e5fb0e..bcf73cb902 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1630,6 +1630,7 @@ Sylvia Grundwürmer <sylvia.grundwuermer@b-plus.com>
Sylwester Dziedziuch <sylwesterx.dziedziuch@intel.com>
Sylwia Wnuczko <sylwia.wnuczko@intel.com>
Szymon Sliwa <szs@semihalf.com>
+Su Sai <spiderdetective.ss@gmail.com> <susai.ss@bytedance.com>
Szymon T Cudzilo <szymon.t.cudzilo@intel.com>
Tadhg Kearney <tadhg.kearney@intel.com>
Taekyung Kim <kim.tae.kyung@navercorp.com>
diff --git a/app/test/test_cksum.c b/app/test/test_cksum.c
index ea443382a1..5bd9723fbd 100644
--- a/app/test/test_cksum.c
+++ b/app/test/test_cksum.c
@@ -85,6 +85,42 @@ static const char test_cksum_ipv4_opts_udp[] = {
0x00, 0x35, 0x00, 0x09, 0x89, 0x6f, 0x78,
};
+/*
+ * generated in scapy with
+ * Ether()/IP()/TCP(options=[NOP,NOP,Timestamps])/os.urandom(113))
+ */
+static const char test_cksum_ipv4_tcp_multi_segs[] = {
+ 0x00, 0x16, 0x3e, 0x0b, 0x6b, 0xd2, 0xee, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x08, 0x00, 0x45, 0x00,
+ 0x00, 0xa5, 0x46, 0x10, 0x40, 0x00, 0x40, 0x06,
+ 0x80, 0xb5, 0xc0, 0xa8, 0xf9, 0x1d, 0xc0, 0xa8,
+ 0xf9, 0x1e, 0xdc, 0xa2, 0x14, 0x51, 0xbb, 0x8f,
+ 0xa0, 0x00, 0xe4, 0x7c, 0xe4, 0xb8, 0x80, 0x10,
+ 0x02, 0x00, 0x4b, 0xc1, 0x00, 0x00, 0x01, 0x01,
+ 0x08, 0x0a, 0x90, 0x60, 0xf4, 0xff, 0x03, 0xc5,
+ 0xb4, 0x19, 0x77, 0x34, 0xd4, 0xdc, 0x84, 0x86,
+ 0xff, 0x44, 0x09, 0x63, 0x36, 0x2e, 0x26, 0x9b,
+ 0x90, 0x70, 0xf2, 0xed, 0xc8, 0x5b, 0x87, 0xaa,
+ 0xb4, 0x67, 0x6b, 0x32, 0x3d, 0xc4, 0xbf, 0x15,
+ 0xa9, 0x16, 0x6c, 0x2a, 0x9d, 0xb2, 0xb7, 0x6b,
+ 0x58, 0x44, 0x58, 0x12, 0x4b, 0x8f, 0xe5, 0x12,
+ 0x11, 0x90, 0x94, 0x68, 0x37, 0xad, 0x0a, 0x9b,
+ 0xd6, 0x79, 0xf2, 0xb7, 0x31, 0xcf, 0x44, 0x22,
+ 0xc8, 0x99, 0x3f, 0xe5, 0xe7, 0xac, 0xc7, 0x0b,
+ 0x86, 0xdf, 0xda, 0xed, 0x0a, 0x0f, 0x86, 0xd7,
+ 0x48, 0xe2, 0xf1, 0xc2, 0x43, 0xed, 0x47, 0x3a,
+ 0xea, 0x25, 0x2d, 0xd6, 0x60, 0x38, 0x30, 0x07,
+ 0x28, 0xdd, 0x1f, 0x0c, 0xdd, 0x7b, 0x7c, 0xd9,
+ 0x35, 0x9d, 0x14, 0xaa, 0xc6, 0x35, 0xd1, 0x03,
+ 0x38, 0xb1, 0xf5,
+};
+
+static const uint8_t test_cksum_ipv4_tcp_multi_segs_len[] = {
+ 66, /* the first seg contains all headers, including L2 to L4 */
+ 61, /* the second seg length is odd, test byte order independent */
+ 52, /* three segs are sufficient to test the most complex scenarios */
+};
+
/* test l3/l4 checksum api */
static int
test_l4_cksum(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len)
@@ -223,6 +259,66 @@ test_l4_cksum(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len)
return -1;
}
+/* test l4 checksum api for a packet with multiple mbufs */
+static int
+test_l4_cksum_multi_mbufs(struct rte_mempool *pktmbuf_pool, const char *pktdata, size_t len,
+ const uint8_t *segs, size_t segs_len)
+{
+ struct rte_mbuf *m[NB_MBUF] = {0};
+ struct rte_mbuf *m_hdr = NULL;
+ struct rte_net_hdr_lens hdr_lens;
+ size_t i, off = 0;
+ uint32_t packet_type, l3;
+ void *l3_hdr;
+ char *data;
+
+ for (i = 0; i < segs_len; i++) {
+ m[i] = rte_pktmbuf_alloc(pktmbuf_pool);
+ if (m[i] == NULL)
+ GOTO_FAIL("Cannot allocate mbuf");
+
+ data = rte_pktmbuf_append(m[i], segs[i]);
+ if (data == NULL)
+ GOTO_FAIL("Cannot append data");
+
+ memcpy(data, pktdata + off, segs[i]);
+ off += segs[i];
+
+ if (m_hdr) {
+ if (rte_pktmbuf_chain(m_hdr, m[i]))
+ GOTO_FAIL("Cannot chain mbuf");
+ } else {
+ m_hdr = m[i];
+ }
+ }
+
+ if (off != len)
+ GOTO_FAIL("Invalid segs");
+
+ packet_type = rte_net_get_ptype(m_hdr, &hdr_lens, RTE_PTYPE_ALL_MASK);
+ l3 = packet_type & RTE_PTYPE_L3_MASK;
+
+ l3_hdr = rte_pktmbuf_mtod_offset(m_hdr, void *, hdr_lens.l2_len);
+ off = hdr_lens.l2_len + hdr_lens.l3_len;
+
+ if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
+ if (rte_ipv4_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off) != 0)
+ GOTO_FAIL("Invalid L4 checksum verification for multiple mbufs");
+ } else if (l3 == RTE_PTYPE_L3_IPV6 || l3 == RTE_PTYPE_L3_IPV6_EXT) {
+ if (rte_ipv6_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off) != 0)
+ GOTO_FAIL("Invalid L4 checksum verification for multiple mbufs");
+ }
+
+ rte_pktmbuf_free_bulk(m, segs_len);
+
+ return 0;
+
+fail:
+ rte_pktmbuf_free_bulk(m, segs_len);
+
+ return -1;
+}
+
static int
test_cksum(void)
{
@@ -256,6 +352,12 @@ test_cksum(void)
sizeof(test_cksum_ipv4_opts_udp)) < 0)
GOTO_FAIL("checksum error on ipv4_opts_udp");
+ if (test_l4_cksum_multi_mbufs(pktmbuf_pool, test_cksum_ipv4_tcp_multi_segs,
+ sizeof(test_cksum_ipv4_tcp_multi_segs),
+ test_cksum_ipv4_tcp_multi_segs_len,
+ sizeof(test_cksum_ipv4_tcp_multi_segs_len)) < 0)
+ GOTO_FAIL("checksum error on multi mbufs check");
+
rte_mempool_free(pktmbuf_pool);
return 0;
diff --git a/lib/net/rte_cksum.h b/lib/net/rte_cksum.h
index a8e8927952..679ba82eb6 100644
--- a/lib/net/rte_cksum.h
+++ b/lib/net/rte_cksum.h
@@ -80,6 +80,25 @@ __rte_raw_cksum_reduce(uint32_t sum)
return (uint16_t)sum;
}
+/**
+ * @internal Reduce a sum to the non-complemented checksum.
+ * Helper routine for the rte_raw_cksum_mbuf().
+ *
+ * @param sum
+ * Value of the sum.
+ * @return
+ * The non-complemented checksum.
+ */
+static inline uint16_t
+__rte_raw_cksum_reduce_u64(uint64_t sum)
+{
+ uint32_t tmp;
+
+ tmp = __rte_raw_cksum_reduce((uint32_t)sum);
+ tmp += __rte_raw_cksum_reduce((uint32_t)(sum >> 32));
+ return __rte_raw_cksum_reduce(tmp);
+}
+
/**
* Process the non-complemented checksum of a buffer.
*
@@ -119,8 +138,8 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
{
const struct rte_mbuf *seg;
const char *buf;
- uint32_t sum, tmp;
- uint32_t seglen, done;
+ uint32_t seglen, done, tmp;
+ uint64_t sum;
/* easy case: all data in the first segment */
if (off + len <= rte_pktmbuf_data_len(m)) {
@@ -157,7 +176,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
for (;;) {
tmp = __rte_raw_cksum(buf, seglen, 0);
if (done & 1)
- tmp = rte_bswap16((uint16_t)tmp);
+ tmp = rte_bswap32(tmp);
sum += tmp;
done += seglen;
if (done == len)
@@ -169,7 +188,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m, uint32_t off, uint32_t len,
seglen = len - done;
}
- *cksum = __rte_raw_cksum_reduce(sum);
+ *cksum = __rte_raw_cksum_reduce_u64(sum);
return 0;
}
--
2.20.1
^ permalink raw reply related
* Re: [v3] net/cksum: compute raw cksum for several segments
From: su sai @ 2026-06-16 12:48 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: dev
In-Reply-To: <20260608100202.0deac83d@phoenix.local>
[-- Attachment #1: Type: text/plain, Size: 9570 bytes --]
Hi Stephen, I've revised the patch per your feedback and sent out v4:
net/cksum: compute raw cksum for several segments. Your review is
appreciated.
On Tue, Jun 9, 2026 at 1:02 AM Stephen Hemminger <stephen@networkplumber.org>
wrote:
> On Mon, 4 Aug 2025 11:54:30 +0800
> Su Sai <spiderdetective.ss@gmail.com> wrote:
>
> > The rte_raw_cksum_mbuf function is used to compute
> > the raw checksum of a packet.
> > If the packet payload is stored in multiple mbufs, the function
> > takes the hard case. In the hard case,
> > the variable 'tmp' is of type uint32_t,
> > so rte_bswap16 drops its high 16 bits.
> > Meanwhile, the variable 'sum' is also of type uint32_t,
> > so 'sum += tmp' drops the carry on overflow.
> > Both losses make the checksum incorrect.
> > This commit fixes the above bug.
> >
> > Signed-off-by: Su Sai <spiderdetective.ss@gmail.com>
> > ---
> > .mailmap | 1 +
> > app/test/test_cksum.c | 106 ++++++++++++++++++++++++++++++++++++++++++
> > lib/net/rte_cksum.h | 27 +++++++++--
> > 3 files changed, 130 insertions(+), 4 deletions(-)
> >
> > diff --git a/.mailmap b/.mailmap
> > index 34a99f93a1..1da1d9f8e1 100644
> > --- a/.mailmap
> > +++ b/.mailmap
> > @@ -1552,6 +1552,7 @@ Sunil Kumar Kori <skori@marvell.com> <
> sunil.kori@nxp.com>
> > Sunil Pai G <sunil.pai.g@intel.com>
> > Sunil Uttarwar <sunilprakashrao.uttarwar@amd.com>
> > Sun Jiajia <sunx.jiajia@intel.com>
> > +Su Sai <spiderdetective.ss@gmail.com> <susai.ss@bytedance.com>
> > Sunyang Wu <sunyang.wu@jaguarmicro.com>
> > Surabhi Boob <surabhi.boob@intel.com>
> > Suyang Ju <sju@paloaltonetworks.com>
> > diff --git a/app/test/test_cksum.c b/app/test/test_cksum.c
> > index f2ab5af5a7..fb2e3cf9e6 100644
> > --- a/app/test/test_cksum.c
> > +++ b/app/test/test_cksum.c
> > @@ -85,6 +85,42 @@ static const char test_cksum_ipv4_opts_udp[] = {
> > 0x00, 0x35, 0x00, 0x09, 0x89, 0x6f, 0x78,
> > };
> >
> > +/*
> > + * generated in scapy with
> > + * Ether()/IP()/TCP(options=[NOP,NOP,Timestamps])/os.urandom(113))
> > + */
> > +static const char test_cksum_ipv4_tcp_multi_segs[] = {
> > + 0x00, 0x16, 0x3e, 0x0b, 0x6b, 0xd2, 0xee, 0xff,
> > + 0xff, 0xff, 0xff, 0xff, 0x08, 0x00, 0x45, 0x00,
> > + 0x00, 0xa5, 0x46, 0x10, 0x40, 0x00, 0x40, 0x06,
> > + 0x80, 0xb5, 0xc0, 0xa8, 0xf9, 0x1d, 0xc0, 0xa8,
> > + 0xf9, 0x1e, 0xdc, 0xa2, 0x14, 0x51, 0xbb, 0x8f,
> > + 0xa0, 0x00, 0xe4, 0x7c, 0xe4, 0xb8, 0x80, 0x10,
> > + 0x02, 0x00, 0x4b, 0xc1, 0x00, 0x00, 0x01, 0x01,
> > + 0x08, 0x0a, 0x90, 0x60, 0xf4, 0xff, 0x03, 0xc5,
> > + 0xb4, 0x19, 0x77, 0x34, 0xd4, 0xdc, 0x84, 0x86,
> > + 0xff, 0x44, 0x09, 0x63, 0x36, 0x2e, 0x26, 0x9b,
> > + 0x90, 0x70, 0xf2, 0xed, 0xc8, 0x5b, 0x87, 0xaa,
> > + 0xb4, 0x67, 0x6b, 0x32, 0x3d, 0xc4, 0xbf, 0x15,
> > + 0xa9, 0x16, 0x6c, 0x2a, 0x9d, 0xb2, 0xb7, 0x6b,
> > + 0x58, 0x44, 0x58, 0x12, 0x4b, 0x8f, 0xe5, 0x12,
> > + 0x11, 0x90, 0x94, 0x68, 0x37, 0xad, 0x0a, 0x9b,
> > + 0xd6, 0x79, 0xf2, 0xb7, 0x31, 0xcf, 0x44, 0x22,
> > + 0xc8, 0x99, 0x3f, 0xe5, 0xe7, 0xac, 0xc7, 0x0b,
> > + 0x86, 0xdf, 0xda, 0xed, 0x0a, 0x0f, 0x86, 0xd7,
> > + 0x48, 0xe2, 0xf1, 0xc2, 0x43, 0xed, 0x47, 0x3a,
> > + 0xea, 0x25, 0x2d, 0xd6, 0x60, 0x38, 0x30, 0x07,
> > + 0x28, 0xdd, 0x1f, 0x0c, 0xdd, 0x7b, 0x7c, 0xd9,
> > + 0x35, 0x9d, 0x14, 0xaa, 0xc6, 0x35, 0xd1, 0x03,
> > + 0x38, 0xb1, 0xf5,
> > +};
> > +
> > +static const uint8_t test_cksum_ipv4_tcp_multi_segs_len[] = {
> > + 66, /* the first seg contains all headers, including L2 to L4 */
> > + 61, /* the second seg length is odd, test byte order independent
> */
> > + 52, /* three segs are sufficient to test the most complex
> scenarios */
> > +};
> > +
> > /* test l3/l4 checksum api */
> > static int
> > test_l4_cksum(struct rte_mempool *pktmbuf_pool, const char *pktdata,
> size_t len)
> > @@ -223,6 +259,70 @@ test_l4_cksum(struct rte_mempool *pktmbuf_pool,
> const char *pktdata, size_t len)
> > return -1;
> > }
> >
> > +/* test l4 checksum api for a packet with multiple mbufs */
> > +static int
> > +test_l4_cksum_multi_mbufs(struct rte_mempool *pktmbuf_pool, const char
> *pktdata, size_t len,
> > + const uint8_t *segs, size_t segs_len)
> > +{
> > + struct rte_mbuf *m[NB_MBUF] = {0};
> > + struct rte_mbuf *m_hdr = NULL;
> > + struct rte_net_hdr_lens hdr_lens;
> > + size_t i, off = 0;
> > + uint32_t packet_type, l3;
> > + void *l3_hdr;
> > + char *data;
> > +
> > + for (i = 0; i < segs_len; i++) {
> > + m[i] = rte_pktmbuf_alloc(pktmbuf_pool);
> > + if (m[i] == NULL)
> > + GOTO_FAIL("Cannot allocate mbuf");
> > +
> > + data = rte_pktmbuf_append(m[i], segs[i]);
> > + if (data == NULL)
> > + GOTO_FAIL("Cannot append data");
> > +
> > + rte_memcpy(data, pktdata + off, segs[i]);
>
> Tests (except rte_memcpy test) should not use rte_memcpy, instead use
> regular memcpy which has better coverage from analyzers.
>
> > + off += segs[i];
> > +
> > + if (m_hdr) {
> > + if (rte_pktmbuf_chain(m_hdr, m[i]))
> > + GOTO_FAIL("Cannot chain mbuf");
> > + } else {
> > + m_hdr = m[i];
> > + }
> > + }
> > +
> > + if (off != len)
> > + GOTO_FAIL("Invalid segs");
> > +
> > + packet_type = rte_net_get_ptype(m_hdr, &hdr_lens,
> RTE_PTYPE_ALL_MASK);
> > + l3 = packet_type & RTE_PTYPE_L3_MASK;
> > +
> > + l3_hdr = rte_pktmbuf_mtod_offset(m_hdr, void *, hdr_lens.l2_len);
> > + off = hdr_lens.l2_len + hdr_lens.l3_len;
> > +
> > + if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
> > + if (rte_ipv4_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off)
> != 0)
> > + GOTO_FAIL("Invalid L4 checksum verification for
> multiple mbufs");
> > + } else if (l3 == RTE_PTYPE_L3_IPV6 || l3 == RTE_PTYPE_L3_IPV6_EXT)
> {
> > + if (rte_ipv6_udptcp_cksum_mbuf_verify(m_hdr, l3_hdr, off)
> != 0)
> > + GOTO_FAIL("Invalid L4 checksum verification for
> multiple mbufs");
> > + }
> > +
> > + for (i = 0; i < segs_len; i++)
> > + rte_pktmbuf_free(m[i]);
>
> Can avoid the loop here and elsewhere by using rte_pktmbuf_free_bulk()
>
> > + return 0;
> > +
> > +fail:
> > + for (i = 0; i < segs_len; i++) {
> > + if (m[i])
> > + rte_pktmbuf_free(m[i]);
> > + }
>
> Freebulk will work here
>
> > + return -1;
> > +}
> > +
> > static int
> > test_cksum(void)
> > {
> > @@ -256,6 +356,12 @@ test_cksum(void)
> > sizeof(test_cksum_ipv4_opts_udp)) < 0)
> > GOTO_FAIL("checksum error on ipv4_opts_udp");
> >
> > + if (test_l4_cksum_multi_mbufs(pktmbuf_pool,
> test_cksum_ipv4_tcp_multi_segs,
> > + sizeof(test_cksum_ipv4_tcp_multi_segs),
> > + test_cksum_ipv4_tcp_multi_segs_len,
> > + sizeof(test_cksum_ipv4_tcp_multi_segs_len)) < 0)
> > + GOTO_FAIL("checksum error on multi mbufs check");
> > +
> > rte_mempool_free(pktmbuf_pool);
> >
> > return 0;
> > diff --git a/lib/net/rte_cksum.h b/lib/net/rte_cksum.h
> > index a8e8927952..679ba82eb6 100644
> > --- a/lib/net/rte_cksum.h
> > +++ b/lib/net/rte_cksum.h
> > @@ -80,6 +80,25 @@ __rte_raw_cksum_reduce(uint32_t sum)
> > return (uint16_t)sum;
> > }
> >
> > +/**
> > + * @internal Reduce a sum to the non-complemented checksum.
> > + * Helper routine for the rte_raw_cksum_mbuf().
> > + *
> > + * @param sum
> > + * Value of the sum.
> > + * @return
> > + * The non-complemented checksum.
> > + */
> > +static inline uint16_t
> > +__rte_raw_cksum_reduce_u64(uint64_t sum)
> > +{
> > + uint32_t tmp;
> > +
> > + tmp = __rte_raw_cksum_reduce((uint32_t)sum);
> > + tmp += __rte_raw_cksum_reduce((uint32_t)(sum >> 32));
> > + return __rte_raw_cksum_reduce(tmp);
> > +}
> > +
> > /**
> > * Process the non-complemented checksum of a buffer.
> > *
> > @@ -119,8 +138,8 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m,
> uint32_t off, uint32_t len,
> > {
> > const struct rte_mbuf *seg;
> > const char *buf;
> > - uint32_t sum, tmp;
> > - uint32_t seglen, done;
> > + uint32_t seglen, done, tmp;
> > + uint64_t sum;
> >
> > /* easy case: all data in the first segment */
> > if (off + len <= rte_pktmbuf_data_len(m)) {
> > @@ -157,7 +176,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m,
> uint32_t off, uint32_t len,
> > for (;;) {
> > tmp = __rte_raw_cksum(buf, seglen, 0);
> > if (done & 1)
> > - tmp = rte_bswap16((uint16_t)tmp);
> > + tmp = rte_bswap32(tmp);
> > sum += tmp;
> > done += seglen;
> > if (done == len)
> > @@ -169,7 +188,7 @@ rte_raw_cksum_mbuf(const struct rte_mbuf *m,
> uint32_t off, uint32_t len,
> > seglen = len - done;
> > }
> >
> > - *cksum = __rte_raw_cksum_reduce(sum);
> > + *cksum = __rte_raw_cksum_reduce_u64(sum);
> > return 0;
> > }
> >
>
>
[-- Attachment #2: Type: text/html, Size: 12542 bytes --]
^ permalink raw reply
* [PATCH v2] app/testpmd: add VLAN priority insert support
From: Xingui Yang @ 2026-06-16 13:10 UTC (permalink / raw)
To: dev
Cc: stephen, david.marchand, aman.deep.singh, fengchengwen,
yangshuaisong, lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260612081411.2798403-1-yangxingui@huawei.com>
The tx_vlan set and tx_qinq set commands only accepted VLAN ID in range
[0, 4095]. This prevented users from setting 802.1p priority and CFI
bits when using hardware VLAN insertion.
Since mbuf vlan_tci field already supports full 16-bit VLAN Tag Control
Information (TCI), relax the validation for TX paths to allow priority
and CFI bits. The vlan_id parameter now accepts:
- Bits 0-11: VLAN ID (0-4095)
- Bit 12: CFI (Canonical Format Indicator)
- Bits 13-15: Priority (0-7, 802.1p CoS)
Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Suggested-by: fengchengwen <fengchengwen@huawei.com>
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
v2:
- Removed --enable-vlan-priority option and global variable as suggested
by Stephen Hemminger. The feature is now always enabled for TX paths
- RX VLAN filter continues to enforce strict VLAN ID validation as
suggested by fengchengwen
- Added documentation updates for testpmd_funcs.rst and release notes
app/test-pmd/config.c | 13 ++++++++-----
doc/guides/rel_notes/release_26_07.rst | 7 +++++++
doc/guides/testpmd_app_ug/testpmd_funcs.rst | 17 ++++++++++++++---
3 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
index 9d457ca88e..38758f9c05 100644
--- a/app/test-pmd/config.c
+++ b/app/test-pmd/config.c
@@ -1241,8 +1241,11 @@ void print_valid_ports(void)
}
static int
-vlan_id_is_invalid(uint16_t vlan_id)
+vlan_id_is_invalid(uint16_t vlan_id, bool is_tx)
{
+ if (is_tx)
+ return 0;
+
if (vlan_id < 4096)
return 0;
fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
@@ -6876,7 +6879,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
if (port_id_is_invalid(port_id, ENABLED_WARN))
return 1;
- if (vlan_id_is_invalid(vlan_id))
+ if (vlan_id_is_invalid(vlan_id, false))
return 1;
diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
if (diag == 0)
@@ -6923,7 +6926,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
struct rte_eth_dev_info dev_info;
int ret;
- if (vlan_id_is_invalid(vlan_id))
+ if (vlan_id_is_invalid(vlan_id, true))
return;
if (ports[port_id].dev_conf.txmode.offloads &
@@ -6954,9 +6957,9 @@ tx_qinq_set(portid_t port_id, uint16_t vlan_id, uint16_t vlan_id_outer)
struct rte_eth_dev_info dev_info;
int ret;
- if (vlan_id_is_invalid(vlan_id))
+ if (vlan_id_is_invalid(vlan_id, true))
return;
- if (vlan_id_is_invalid(vlan_id_outer))
+ if (vlan_id_is_invalid(vlan_id_outer, true))
return;
ret = eth_dev_info_get_print_err(port_id, &dev_info);
diff --git a/doc/guides/rel_notes/release_26_07.rst b/doc/guides/rel_notes/release_26_07.rst
index 5d7aa8d1bf..e382c7f407 100644
--- a/doc/guides/rel_notes/release_26_07.rst
+++ b/doc/guides/rel_notes/release_26_07.rst
@@ -150,6 +150,13 @@ New Features
* Added ``eof`` devarg to use link state to signal end of receive file input.
* Added unit test suite.
+* **Added VLAN priority support in testpmd.**
+
+ Added support for setting VLAN priority and CFI bits in ``tx_vlan set``
+ and ``tx_qinq set`` commands. The ``vlan_tci`` parameter now accepts the
+ full 16-bit VLAN Tag Control Information (TCI) format, which includes
+ priority (bits 13-15), CFI (bit 12), and VLAN ID (bits 0-11).
+
* **Added AI review helpers.**
Added AGENTS.md file for AI review
diff --git a/doc/guides/testpmd_app_ug/testpmd_funcs.rst b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
index f0f2b0758b..b967810b10 100644
--- a/doc/guides/testpmd_app_ug/testpmd_funcs.rst
+++ b/doc/guides/testpmd_app_ug/testpmd_funcs.rst
@@ -1120,15 +1120,26 @@ tx_vlan set
Set hardware insertion of VLAN IDs in packets sent on a port::
- testpmd> tx_vlan set (port_id) vlan_id[, vlan_id_outer]
+ testpmd> tx_vlan set (port_id) vlan_tci[, vlan_tci_outer]
+
+The ``vlan_tci`` parameter accepts the full 16-bit VLAN Tag Control Information (TCI)
+format, which includes:
+
+* Bits 0-11: VLAN ID (0-4095)
+* Bit 12: CFI (Canonical Format Indicator)
+* Bits 13-15: Priority (0-7, 802.1p CoS)
For example, set a single VLAN ID (5) insertion on port 0::
- tx_vlan set 0 5
+ testpmd> tx_vlan set 0 5
+
+Or, set a VLAN ID with priority (priority=3, VLAN ID=6) insertion on port 0::
+
+ testpmd> tx_vlan set 0 0x6006
Or, set double VLAN ID (inner: 2, outer: 3) insertion on port 1::
- tx_vlan set 1 2 3
+ testpmd> tx_vlan set 1 2 3
tx_vlan set pvid
--
2.43.0
^ permalink raw reply related
* Re: [PATCH] app/testpmd: add VLAN priority insert support
From: yangxingui @ 2026-06-16 13:17 UTC (permalink / raw)
To: fengchengwen, dev
Cc: stephen, david.marchand, aman.deep.singh, yangshuaisong,
lihuisong, liuyonglong, kangfenglong
In-Reply-To: <7ead3834-afaa-4319-83bf-faf19b3ea3ef@huawei.com>
On 2026/6/15 17:46, fengchengwen wrote:
> On 6/12/2026 4:14 PM, Xingui Yang wrote:
>> The tx_vlan set command currently only accepts a VLAN ID in range
>> [0, 4095]. This patch adds support for an extended format that includes
>> 802.1p priority and CFI bits, allowing users to set the VLAN priority
>> tag when inserting VLAN headers in TX packets.
>>
>> The extended format is:
>> bit 0-11: VLAN ID (0-4095)
>> bit 12: CFI (Canonical Format Indicator)
>> bit 13-15: Priority (0-7, 802.1p CoS)
>>
>> This is consistent with the VLAN tag structure used by
>> rte_eth_dev_set_vlan_pvid() where the PVID field encodes VLAN ID, CFI
>> and priority in the same format.
>>
>> A new command line option --enable-vlan-priority is added to enable this
>> feature. By default, the feature is disabled to maintain backward
>> compatibility with existing users. When enabled, the
>> vlan_id_is_invalid() function allows any 16-bit value to pass, while the
>> full 16-bit value (including CFI and priority bits) is passed to the
>> driver for hardware VLAN insertion.
>>
>> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
>> ---
>> app/test-pmd/config.c | 24 +++++++++++++++---------
>> app/test-pmd/parameters.c | 6 ++++++
>> app/test-pmd/testpmd.c | 5 +++++
>> app/test-pmd/testpmd.h | 2 ++
>> 4 files changed, 28 insertions(+), 9 deletions(-)
>>
>> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
>> index 36b9b023e2..80cde109e6 100644
>> --- a/app/test-pmd/config.c
>> +++ b/app/test-pmd/config.c
>> @@ -1241,12 +1241,18 @@ void print_valid_ports(void)
>> }
>>
>> static int
>> -vlan_id_is_invalid(uint16_t vlan_id)
>> +vlan_id_is_invalid(uint16_t vlan_id, int vlan_priority_ena)
>> {
>> - if (vlan_id < 4096)
>> - return 0;
>> - fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
>> - return 1;
>> + if (!vlan_priority_ena && vlan_id >= 4096) {
>> + fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
>> + return 1;
>> + }
>> +
>> + /*
>> + * When vlan_priority_ena is enabled, allow any 16-bit value
>> + * to pass priority and CFI bits to the driver.
>> + */
>> + return 0;
>> }
>>
>> static uint32_t
>> @@ -6876,7 +6882,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
>>
>> if (port_id_is_invalid(port_id, ENABLED_WARN))
>> return 1;
>> - if (vlan_id_is_invalid(vlan_id))
>> + if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
>
> Just use vlan_id_is_invalid(vlan_id, false), because Rx does not need to implement this.
>
>> return 1;
>> diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
>> if (diag == 0)
>> @@ -6923,7 +6929,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
>> struct rte_eth_dev_info dev_info;
>> int ret;
>>
>> - if (vlan_id_is_invalid(vlan_id))
>> + if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
>> return;
>>
>> if (ports[port_id].dev_conf.txmode.offloads &
>> @@ -6954,9 +6960,9 @@ tx_qinq_set(portid_t port_id, uint16_t vlan_id, uint16_t vlan_id_outer)
>> struct rte_eth_dev_info dev_info;
>> int ret;
>>
>> - if (vlan_id_is_invalid(vlan_id))
>> + if (vlan_id_is_invalid(vlan_id, vlan_priority_insert_ena))
>> return;
>> - if (vlan_id_is_invalid(vlan_id_outer))
>> + if (vlan_id_is_invalid(vlan_id_outer, vlan_priority_insert_ena))
>> return;
>>
>> ret = eth_dev_info_get_print_err(port_id, &dev_info);
>> diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c
>> index 8c3b1244e7..3f37498d3b 100644
>> --- a/app/test-pmd/parameters.c
>> +++ b/app/test-pmd/parameters.c
>> @@ -117,6 +117,8 @@ enum {
>> TESTPMD_OPT_ENABLE_HW_VLAN_EXTEND_NUM,
>> #define TESTPMD_OPT_ENABLE_HW_QINQ_STRIP "enable-hw-qinq-strip"
>> TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM,
>> +#define TESTPMD_OPT_ENABLE_VLAN_PRIORITY "enable-vlan-priority"
>> + TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM,
>
> How about TESTPMD_OPT_ENABLE_VLAN_INSERT_PRI "enable-vlan-insert-pri"
I have adopted the simpler approach as suggested by Stephen.
>> #define TESTPMD_OPT_ENABLE_DROP_EN "enable-drop-en"
>> TESTPMD_OPT_ENABLE_DROP_EN_NUM,
>> #define TESTPMD_OPT_DISABLE_RSS "disable-rss"
>> @@ -461,6 +463,7 @@ usage(char* progname)
>> printf(" --enable-hw-vlan-strip: enable hardware vlan strip.\n");
>> printf(" --enable-hw-vlan-extend: enable hardware vlan extend.\n");
>> printf(" --enable-hw-qinq-strip: enable hardware qinq strip.\n");
>> + printf(" --enable-vlan-priority: enable vlan priority insert.\n");
>> printf(" --enable-drop-en: enable per queue packet drop.\n");
>> printf(" --disable-rss: disable rss.\n");
>> printf(" --enable-rss: Force rss even for single-queue operation.\n");
>> @@ -1259,6 +1262,9 @@ launch_args_parse(int argc, char** argv)
>> case TESTPMD_OPT_ENABLE_HW_QINQ_STRIP_NUM:
>> rx_offloads |= RTE_ETH_RX_OFFLOAD_QINQ_STRIP;
>> break;
>> + case TESTPMD_OPT_ENABLE_VLAN_PRIORITY_NUM:
>> + vlan_priority_insert_ena = 1;
>
> How about tx_insert_vlan_pri_en
>
>> + break;
I have adopted the simpler approach as suggested by Stephen, which
eliminates the need for a new command-line option and global variable.
>
> We also need to update the testpmd documentation.
OK.
Thanks,
Xingui
^ permalink raw reply
* Re: [PATCH] app/testpmd: add VLAN priority insert support
From: yangxingui @ 2026-06-16 13:19 UTC (permalink / raw)
To: Stephen Hemminger
Cc: dev, david.marchand, aman.deep.singh, fengchengwen, yangshuaisong,
lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260615121214.6fb7d8b7@phoenix.local>
On 2026/6/16 3:12, Stephen Hemminger wrote:
> On Fri, 12 Jun 2026 16:14:11 +0800
> Xingui Yang <yangxingui@huawei.com> wrote:
>
>> The tx_vlan set command currently only accepts a VLAN ID in range
>> [0, 4095]. This patch adds support for an extended format that includes
>> 802.1p priority and CFI bits, allowing users to set the VLAN priority
>> tag when inserting VLAN headers in TX packets.
>>
>> The extended format is:
>> bit 0-11: VLAN ID (0-4095)
>> bit 12: CFI (Canonical Format Indicator)
>> bit 13-15: Priority (0-7, 802.1p CoS)
>>
>> This is consistent with the VLAN tag structure used by
>> rte_eth_dev_set_vlan_pvid() where the PVID field encodes VLAN ID, CFI
>> and priority in the same format.
>>
>> A new command line option --enable-vlan-priority is added to enable this
>> feature. By default, the feature is disabled to maintain backward
>> compatibility with existing users. When enabled, the
>> vlan_id_is_invalid() function allows any 16-bit value to pass, while the
>> full 16-bit value (including CFI and priority bits) is passed to the
>> driver for hardware VLAN insertion.
>>
>> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
>> ---
>
>
> Having ability to set priority bits is good, and testpmd should allow it.
> The mbuf vlan_tci is already a full 16-bit TCI (priority/CFI/VID), and
> the TX insert path copies tx_vlan_id straight into it. So priority
> insert already works; the only thing in the way is the < 4096 check.
>
> Do you actually need a new option for this? Both of_push_vlan +
> of_set_vlan_pcp (rte_flow) and "tx_vlan set pvid" already let you set
> the priority bits today, with no new code.
>
> If you still want "tx_vlan set" itself to carry priority, I'd suggest
> a smaller change: relax only the TX insert validators and drop the
> option and the global. Don't touch rx_vft_set -- it feeds the VLAN
> filter, which only takes a VLAN ID and rejects > 4095 anyway, so the
> flag just turns a clear error into a confusing one.
>
> Either way, if the option stays, please document it, and add a release note.
> The commit message should explain why the existing paths aren't enough.
Thank you for the suggestion. I have implemented the simpler approach in v2.
Thanks,
Xingui
^ permalink raw reply
* Re: [PATCH 2/2] ethdev: return 0 from dummy queue count
From: Stephen Hemminger @ 2026-06-16 14:07 UTC (permalink / raw)
To: Maxime Leroy
Cc: dev, stable, Thomas Monjalon, Andrew Rybchenko, Sunil Kumar Kori,
Morten Brørup
In-Reply-To: <20260616094259.686555-3-maxime@leroys.fr>
On Tue, 16 Jun 2026 11:42:58 +0200
Maxime Leroy <maxime@leroys.fr> wrote:
> The dummy rx_queue_count/tx_queue_count callback returned -ENOTSUP. On a
> port that is not started (freshly allocated, or stopped once the fast-path
> ops are reset to dummies) there are no packets queued, so the truthful
> answer is 0, not an error: querying the count is not an unsupported
> operation. This also matches the dummy Rx/Tx burst, which reports 0
> packets.
>
> A poll-mode worker checking rte_eth_rx_queue_count() across a concurrent
> port stop then sees an empty queue instead of a negative error.
>
> Fixes: 066f3d9cc21c ("ethdev: remove callback checks from fast path")
> Cc: stable@dpdk.org
>
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Signed-off-by: Maxime Leroy <maxime@leroys.fr>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
^ permalink raw reply
* Re: [PATCH v2 6/6] net/dpaa2: drop the fake software VLAN strip offload
From: Stephen Hemminger @ 2026-06-16 14:11 UTC (permalink / raw)
To: Maxime Leroy; +Cc: dev, Hemant Agrawal, Sachin Saxena
In-Reply-To: <20260616102727.708948-7-maxime@leroys.fr>
On Tue, 16 Jun 2026 12:27:26 +0200
Maxime Leroy <maxime@leroys.fr> wrote:
> RTE_ETH_RX_OFFLOAD_VLAN_STRIP is advertised, but no hardware VLAN strip
> backs it: when enabled, the Rx burst calls rte_vlan_strip() on every
> frame, a software op masquerading as a hardware offload.
>
> It saves a forwarding application nothing: the datapath reads the L2
> header anyway to classify or strip. The offload does not remove that
> read, it relocates it into the driver Rx burst, where it is far more
> expensive.
>
> The cost is a matter of timing. rte_vlan_strip() reaches the L2 header
> through rte_pktmbuf_mtod(), which dereferences mbuf->buf_addr. On a
> freshly recycled buffer that mbuf cacheline is cold. eth_fd_to_mbuf()
> has just written other fields of it (data_off, ol_flags), but buf_addr
> is a persistent field it does not rewrite. A write does not stall: it
> posts to the store buffer while the line fills in the background, and
> the rewritten fields are forwarded straight from there. buf_addr has
> nothing to forward, so it must be read from the line, whose fill is
> still in flight, and the read stalls. The ethertype read that follows,
> on the cold payload line, stalls again. Read later by the application,
> when the fill has completed, the same read hits. The offload just
> performs it at the worst possible moment.
>
> Measured on a single-core port-to-port forwarding test over two 10G
> ports (one core at 2 GHz, 64-byte untagged frames):
>
> - throughput 4.22 -> 5.00 Mpps (+18 percent)
> - IPC 0.93 -> 1.25: the cost was memory stall, not compute
> - L3/DRAM-bound L2 refills 319M -> 200M over 10s (-37 percent)
>
> perf confirms it: with the offload, the buf_addr load (the cold mbuf
> field) and the payload load account for about 84 percent of the Rx
> burst's L2 refills; removing it, those vanish and only the inherent DQRR
> dequeue misses remain.
>
> Stop advertising VLAN_STRIP and remove the rte_vlan_strip() calls from
> every Rx path. This is a behavioural change: the tag is left in the
> frame, so an application must strip it itself, on the L2 header it
> already reads.
>
> Signed-off-by: Maxime Leroy <maxime@leroys.fr>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
^ permalink raw reply
* Re: [PATCH v2] app/testpmd: add VLAN priority insert support
From: Stephen Hemminger @ 2026-06-16 14:23 UTC (permalink / raw)
To: Xingui Yang
Cc: dev, david.marchand, aman.deep.singh, fengchengwen, yangshuaisong,
lihuisong, liuyonglong, kangfenglong
In-Reply-To: <20260616131001.2955655-1-yangxingui@huawei.com>
On Tue, 16 Jun 2026 21:10:01 +0800
Xingui Yang <yangxingui@huawei.com> wrote:
> The tx_vlan set and tx_qinq set commands only accepted VLAN ID in range
> [0, 4095]. This prevented users from setting 802.1p priority and CFI
> bits when using hardware VLAN insertion.
>
> Since mbuf vlan_tci field already supports full 16-bit VLAN Tag Control
> Information (TCI), relax the validation for TX paths to allow priority
> and CFI bits. The vlan_id parameter now accepts:
> - Bits 0-11: VLAN ID (0-4095)
> - Bit 12: CFI (Canonical Format Indicator)
> - Bits 13-15: Priority (0-7, 802.1p CoS)
>
> Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
> Suggested-by: fengchengwen <fengchengwen@huawei.com>
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> ---
> v2:
> - Removed --enable-vlan-priority option and global variable as suggested
> by Stephen Hemminger. The feature is now always enabled for TX paths
> - RX VLAN filter continues to enforce strict VLAN ID validation as
> suggested by fengchengwen
> - Added documentation updates for testpmd_funcs.rst and release notes
>
> app/test-pmd/config.c | 13 ++++++++-----
> doc/guides/rel_notes/release_26_07.rst | 7 +++++++
> doc/guides/testpmd_app_ug/testpmd_funcs.rst | 17 ++++++++++++++---
> 3 files changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c
> index 9d457ca88e..38758f9c05 100644
> --- a/app/test-pmd/config.c
> +++ b/app/test-pmd/config.c
> @@ -1241,8 +1241,11 @@ void print_valid_ports(void)
> }
>
> static int
> -vlan_id_is_invalid(uint16_t vlan_id)
> +vlan_id_is_invalid(uint16_t vlan_id, bool is_tx)
> {
> + if (is_tx)
> + return 0;
> +
> if (vlan_id < 4096)
> return 0;
> fprintf(stderr, "Invalid vlan_id %d (must be < 4096)\n", vlan_id);
> @@ -6876,7 +6879,7 @@ rx_vft_set(portid_t port_id, uint16_t vlan_id, int on)
>
> if (port_id_is_invalid(port_id, ENABLED_WARN))
> return 1;
> - if (vlan_id_is_invalid(vlan_id))
> + if (vlan_id_is_invalid(vlan_id, false))
> return 1;
> diag = rte_eth_dev_vlan_filter(port_id, vlan_id, on);
> if (diag == 0)
> @@ -6923,7 +6926,7 @@ tx_vlan_set(portid_t port_id, uint16_t vlan_id)
> struct rte_eth_dev_info dev_info;
> int ret;
>
> - if (vlan_id_is_invalid(vlan_id))
> + if (vlan_id_is_invalid(vlan_id, true))
> return;
Why have the is_tx flag if it is always passed as a constant?
Just remove the whole vlan_id_is_invalid() check in the transmit path.
Maybe add a comment that any VLAN is allowed on transmit?
Or make a new function, since a VLAN of 0xffff is reserved. Though you might want
to allow it, since testpmd is for testing even invalid packets.
^ permalink raw reply
* Re: [RFC 1/4] telemetry: allow commands to receive file descriptors
From: Stephen Hemminger @ 2026-06-16 14:26 UTC (permalink / raw)
To: Bruce Richardson; +Cc: dev
In-Reply-To: <ajFCXCZfikPJTrLH@bricha3-mobl1.ger.corp.intel.com>
On Tue, 16 Jun 2026 13:32:28 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Tue, Jun 09, 2026 at 02:02:02PM -0700, Stephen Hemminger wrote:
> > Add rte_telemetry_register_cmd_fd_arg() to register a command whose
> > callback also receives file descriptors passed by the client as
> > SCM_RIGHTS ancillary data. The callback owns the descriptors and must
> > close them.
> >
> > This lets a client open a file itself and hand the descriptor to the
> > primary process, so DPDK never opens the path. That avoids path and
> > permission problems and works across container filesystem namespaces.
> >
> > Existing commands and clients are unaffected. If an unsolicited file
> > descriptor is passed, it is closed.
> >
>
> This scheme seems reasonable in general. My only concern is whether the
> lack of potential Windows support is an issue. For regular telemetry, there
> was always the option of a Windows implementation using regular
> TCP/UDP/SCTP sockets bound to localhost. However, AFAIK there is no Windows
> implementation of anything that supports passing file descriptors or handles
> between processes.
>
> Some other pieces of feedback inline below.
>
> /Bruce
I have a new version (in testing) that passes the filename as a parameter.
That should work without the fd passing.
^ permalink raw reply
* Re: [RFC 0/4] alternative capture mechanism
From: Stephen Hemminger @ 2026-06-16 14:28 UTC (permalink / raw)
To: Bruce Richardson; +Cc: dev
In-Reply-To: <ajFDjhDc5hXhE8km@bricha3-mobl1.ger.corp.intel.com>
On Tue, 16 Jun 2026 13:37:34 +0100
Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Tue, Jun 09, 2026 at 02:02:01PM -0700, Stephen Hemminger wrote:
> > This is an RFC for an alternative way to capture packets from a DPDK
> > application. I did a brief demo of a similar mechanism at the DPDK Summit, but
> > this is more complete. Capture runs in the primary process and is driven
> > entirely over telemetry; no secondary process is involved.
> >
> > A client asks the application to start capturing and passes it a file
> > descriptor to write to. The application writes pcapng to that descriptor.
> > A Wireshark extcap script is the intended front end, but the control path
> > is just telemetry and the output is just a pipe, so other front ends are
> > possible.
> >
> > 1/4 telemetry: let a command receive file descriptors from the client
> > 2/4 capture: the library
> > 3/4 test: functional test
> > 4/4 app: the Wireshark extcap script and its documentation
> >
> > Setup and usage are in doc/guides/tools/wireshark_extcap.rst.
> >
> > Primary process only for now; secondary-process capture is possible as
> > follow-on. Posting as RFC to get feedback on the approach.
> >
> > The extcap script is dual licensed (BSD-3-Clause OR GPL-2.0-or-later) as
> > it may be more useful in the Wireshark tree.
> >
>
> One concern I have though - does this cause system-calls to be made in the
> fast-path because we are writing to a passed-in FD? For performance
> reasons, would it not be better to use a memory buffer for this, thereby
> avoiding syscalls? For example, rather than passing in an FD to telemetry,
> we could pass in a key to be passed to shmget (going old-school!), or
> name parameter for shm_open. Thereafter with the memory buffer we can use a
> circular ring or similar to pass the data from app to client.
>
> /Bruce
The system calls are contained inside the thread spawned when capture starts.
The flow is:
callback -> ring -> capture thread -> FIFO -> wireshark
^ permalink raw reply
* RE: [PATCH v2 6/6] net/dpaa2: drop the fake software VLAN strip offload
From: Hemant Agrawal @ 2026-06-16 15:40 UTC (permalink / raw)
To: Stephen Hemminger, Maxime Leroy; +Cc: dev@dpdk.org, Sachin Saxena
In-Reply-To: <20260616071110.1baf1beb@phoenix.local>
> -----Original Message-----
> From: Stephen Hemminger <stephen@networkplumber.org>
> Sent: 16 June 2026 19:41
> To: Maxime Leroy <maxime@leroys.fr>
> Cc: dev@dpdk.org; Hemant Agrawal <hemant.agrawal@nxp.com>; Sachin
> Saxena <sachin.saxena@nxp.com>
> Subject: Re: [PATCH v2 6/6] net/dpaa2: drop the fake software VLAN strip
> offload
> Importance: High
>
> On Tue, 16 Jun 2026 12:27:26 +0200
> Maxime Leroy <maxime@leroys.fr> wrote:
>
> > RTE_ETH_RX_OFFLOAD_VLAN_STRIP is advertised, but no hardware VLAN
> > strip backs it: when enabled, the Rx burst calls rte_vlan_strip() on
> > every frame, a software op masquerading as a hardware offload.
> >
> > It saves a forwarding application nothing: the datapath reads the L2
> > header anyway to classify or strip. The offload does not remove that
> > read, it relocates it into the driver Rx burst, where it is far more
> > expensive.
> >
> > The cost is a matter of timing. rte_vlan_strip() reaches the L2 header
> > through rte_pktmbuf_mtod(), which dereferences mbuf->buf_addr. On a
> > freshly recycled buffer that mbuf cacheline is cold. eth_fd_to_mbuf()
> > has just written other fields of it (data_off, ol_flags), but buf_addr
> > is a persistent field it does not rewrite. A write does not stall: it
> > posts to the store buffer while the line fills in the background, and
> > the rewritten fields are forwarded straight from there. buf_addr has
> > nothing to forward, so it must be read from the line, whose fill is
> > still in flight, and the read stalls. The ethertype read that follows,
> > on the cold payload line, stalls again. Read later by the application,
> > when the fill has completed, the same read hits. The offload just
> > performs it at the worst possible moment.
> >
> > Measured on a single-core port-to-port forwarding test over two 10G
> > ports (one core at 2 GHz, 64-byte untagged frames):
> >
> > - throughput 4.22 -> 5.00 Mpps (+18 percent)
> > - IPC 0.93 -> 1.25: the cost was memory stall, not compute
> > - L3/DRAM-bound L2 refills 319M -> 200M over 10s (-37 percent)
> >
> > perf confirms it: with the offload, the buf_addr load (the cold mbuf
> > field) and the payload load account for about 84 percent of the Rx
> > burst's L2 refills; removing it, those vanish and only the inherent
> > DQRR dequeue misses remain.
> >
> > Stop advertising VLAN_STRIP and remove the rte_vlan_strip() calls from
> > every Rx path. This is a behavioural change: the tag is left in the
> > frame, so an application must strip it itself, on the L2 header it
> > already reads.
> >
> > Signed-off-by: Maxime Leroy <maxime@leroys.fr>
>
> Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Acked-by: Hemant Agrawal <hemant.agrawal@nxp.com>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox