* [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
@ 2025-11-17 21:42 ` Tariq Toukan
2025-11-19 23:11 ` Matthew W Carlis
2025-11-17 21:42 ` [PATCH net-next 2/5] net/mlx5e: Recover SQ on excessive PTP TX timestamp delta Tariq Toukan
` (4 subsequent siblings)
5 siblings, 1 reply; 9+ messages in thread
From: Tariq Toukan @ 2025-11-17 21:42 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Richard Cochran, netdev, linux-rdma, linux-kernel, Gal Pressman,
Moshe Shemesh, Dragos Tatulea, Matthew W Carlis
From: Gal Pressman <gal@nvidia.com>
Matthew and Jakub reported [1] issues where inventory automation tools
are calling EEPROM query repeatedly on a port that doesn't have an SFP
connected, resulting in millions of error prints.
Move MCIA register status extraction from the query functions to the
callers, allowing use of extack reporting instead of a dmesg print when
using the netlink API.
[1] https://lore.kernel.org/netdev/20251028194011.39877-1-mattc@purestorage.com/
Cc: Matthew W Carlis <mattc@purestorage.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../ethernet/mellanox/mlx5/core/en_ethtool.c | 19 +++++-----
.../ethernet/mellanox/mlx5/core/mlx5_core.h | 4 +--
.../net/ethernet/mellanox/mlx5/core/port.c | 35 +++++++++----------
3 files changed, 30 insertions(+), 28 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 01b8f05a23db..7cf2ec8543f6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -2027,7 +2027,7 @@ static int mlx5e_get_module_info(struct net_device *netdev,
int size_read = 0;
u8 data[4] = {0};
- size_read = mlx5_query_module_eeprom(dev, 0, 2, data);
+ size_read = mlx5_query_module_eeprom(dev, 0, 2, data, NULL);
if (size_read < 2)
return -EIO;
@@ -2069,6 +2069,7 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev,
struct mlx5_core_dev *mdev = priv->mdev;
int offset = ee->offset;
int size_read;
+ u8 status = 0;
int i = 0;
if (!ee->len)
@@ -2078,15 +2079,15 @@ static int mlx5e_get_module_eeprom(struct net_device *netdev,
while (i < ee->len) {
size_read = mlx5_query_module_eeprom(mdev, offset, ee->len - i,
- data + i);
-
+ data + i, &status);
if (!size_read)
/* Done reading */
return 0;
if (size_read < 0) {
- netdev_err(priv->netdev, "%s: mlx5_query_eeprom failed:0x%x\n",
- __func__, size_read);
+ netdev_err(netdev,
+ "%s: mlx5_query_eeprom failed:0x%x, status %u\n",
+ __func__, size_read, status);
return size_read;
}
@@ -2106,6 +2107,7 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev,
struct mlx5_core_dev *mdev = priv->mdev;
u8 *data = page_data->data;
int size_read;
+ u8 status = 0;
int i = 0;
if (!page_data->length)
@@ -2119,7 +2121,8 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev,
query.page = page_data->page;
while (i < page_data->length) {
query.size = page_data->length - i;
- size_read = mlx5_query_module_eeprom_by_page(mdev, &query, data + i);
+ size_read = mlx5_query_module_eeprom_by_page(mdev, &query,
+ data + i, &status);
/* Done reading, return how many bytes was read */
if (!size_read)
@@ -2128,8 +2131,8 @@ static int mlx5e_get_module_eeprom_by_page(struct net_device *netdev,
if (size_read < 0) {
NL_SET_ERR_MSG_FMT_MOD(
extack,
- "Query module eeprom by page failed, read %u bytes, err %d",
- i, size_read);
+ "Query module eeprom by page failed, read %u bytes, err %d, status %u",
+ i, size_read, status);
return size_read;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index acef7d0ffa09..cfebc110c02f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -357,11 +357,11 @@ int mlx5_set_port_fcs(struct mlx5_core_dev *mdev, u8 enable);
void mlx5_query_port_fcs(struct mlx5_core_dev *mdev, bool *supported,
bool *enabled);
int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
- u16 offset, u16 size, u8 *data);
+ u16 offset, u16 size, u8 *data, u8 *status);
int
mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
struct mlx5_module_eeprom_query_params *params,
- u8 *data);
+ u8 *data, u8 *status);
int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out);
int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
index aa9f2b0a77d3..e4b1dfafb41f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -289,11 +289,11 @@ int mlx5_query_module_num(struct mlx5_core_dev *dev, int *module_num)
}
static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num,
- u8 *module_id)
+ u8 *module_id, u8 *status)
{
u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {};
u32 out[MLX5_ST_SZ_DW(mcia_reg)];
- int err, status;
+ int err;
u8 *ptr;
MLX5_SET(mcia_reg, in, i2c_device_address, MLX5_I2C_ADDR_LOW);
@@ -308,12 +308,12 @@ static int mlx5_query_module_id(struct mlx5_core_dev *dev, int module_num,
if (err)
return err;
- status = MLX5_GET(mcia_reg, out, status);
- if (status) {
- mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
- status);
+ if (MLX5_GET(mcia_reg, out, status)) {
+ if (status)
+ *status = MLX5_GET(mcia_reg, out, status);
return -EIO;
}
+
ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
*module_id = ptr[0];
@@ -370,13 +370,14 @@ static int mlx5_mcia_max_bytes(struct mlx5_core_dev *dev)
}
static int mlx5_query_mcia(struct mlx5_core_dev *dev,
- struct mlx5_module_eeprom_query_params *params, u8 *data)
+ struct mlx5_module_eeprom_query_params *params,
+ u8 *data, u8 *status)
{
u32 in[MLX5_ST_SZ_DW(mcia_reg)] = {};
u32 out[MLX5_ST_SZ_DW(mcia_reg)];
- int status, err;
void *ptr;
u16 size;
+ int err;
size = min_t(int, params->size, mlx5_mcia_max_bytes(dev));
@@ -392,12 +393,9 @@ static int mlx5_query_mcia(struct mlx5_core_dev *dev,
if (err)
return err;
- status = MLX5_GET(mcia_reg, out, status);
- if (status) {
- mlx5_core_err(dev, "query_mcia_reg failed: status: 0x%x\n",
- status);
+ *status = MLX5_GET(mcia_reg, out, status);
+ if (*status)
return -EIO;
- }
ptr = MLX5_ADDR_OF(mcia_reg, out, dword_0);
memcpy(data, ptr, size);
@@ -406,7 +404,7 @@ static int mlx5_query_mcia(struct mlx5_core_dev *dev,
}
int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
- u16 offset, u16 size, u8 *data)
+ u16 offset, u16 size, u8 *data, u8 *status)
{
struct mlx5_module_eeprom_query_params query = {0};
u8 module_id;
@@ -416,7 +414,8 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
if (err)
return err;
- err = mlx5_query_module_id(dev, query.module_number, &module_id);
+ err = mlx5_query_module_id(dev, query.module_number, &module_id,
+ status);
if (err)
return err;
@@ -441,12 +440,12 @@ int mlx5_query_module_eeprom(struct mlx5_core_dev *dev,
query.size = size;
query.offset = offset;
- return mlx5_query_mcia(dev, &query, data);
+ return mlx5_query_mcia(dev, &query, data, status);
}
int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
struct mlx5_module_eeprom_query_params *params,
- u8 *data)
+ u8 *data, u8 *status)
{
int err;
@@ -460,7 +459,7 @@ int mlx5_query_module_eeprom_by_page(struct mlx5_core_dev *dev,
return -EINVAL;
}
- return mlx5_query_mcia(dev, params, data);
+ return mlx5_query_mcia(dev, params, data, status);
}
static int mlx5_query_port_pvlc(struct mlx5_core_dev *dev, u32 *pvlc,
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately
2025-11-17 21:42 ` [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately Tariq Toukan
@ 2025-11-19 23:11 ` Matthew W Carlis
2025-11-23 7:11 ` Gal Pressman
0 siblings, 1 reply; 9+ messages in thread
From: Matthew W Carlis @ 2025-11-19 23:11 UTC (permalink / raw)
To: tariqt
Cc: andrew+netdev, davem, dtatulea, edumazet, gal, kuba, leon,
linux-kernel, linux-rdma, mattc, mbloch, moshe, netdev, pabeni,
richardcochran, saeedm
On Mon, 17 Nov 2025 23:42:05 +0200, Gal Pressman said
> Matthew and Jakub reported [1] issues where inventory automation tools
> are calling EEPROM query repeatedly on a port that doesn't have an SFP
> connected, resulting in millions of error prints.
I'm not very familiar with the networking stack in general, but I poked around a
little trying to be able to come up with a meaningful review. I noticed that in
ethtool there are two methods registered for "ethtool -m".. Looks like it first
prefers a netlink method, but also may fall back on an ioctl implementation.
Will users who end up in the ioctl path expect to see the kernel message? In the
case of users who run "ethtool -m" on a device without a transceiver installed I
think we should expect to see something as follows? (Is this correct?)
$ ethtool -m ethx
"netlink error: mlx5: Query module eeprom by page failed, read xxx bytes, err xxx, status xxx"
Thank you for helping on this issue.
-Matt
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately
2025-11-19 23:11 ` Matthew W Carlis
@ 2025-11-23 7:11 ` Gal Pressman
0 siblings, 0 replies; 9+ messages in thread
From: Gal Pressman @ 2025-11-23 7:11 UTC (permalink / raw)
To: Matthew W Carlis, tariqt
Cc: andrew+netdev, davem, dtatulea, edumazet, kuba, leon,
linux-kernel, linux-rdma, mbloch, moshe, netdev, pabeni,
richardcochran, saeedm
On 20/11/2025 1:11, Matthew W Carlis wrote:
> On Mon, 17 Nov 2025 23:42:05 +0200, Gal Pressman said
>
>> Matthew and Jakub reported [1] issues where inventory automation tools
>> are calling EEPROM query repeatedly on a port that doesn't have an SFP
>> connected, resulting in millions of error prints.
>
> I'm not very familiar with the networking stack in general, but I poked around a
> little trying to be able to come up with a meaningful review. I noticed that in
> ethtool there are two methods registered for "ethtool -m".. Looks like it first
> prefers a netlink method, but also may fall back on an ioctl implementation.
>
> Will users who end up in the ioctl path expect to see the kernel message? In the
> case of users who run "ethtool -m" on a device without a transceiver installed I
> think we should expect to see something as follows? (Is this correct?)
>
> $ ethtool -m ethx
> "netlink error: mlx5: Query module eeprom by page failed, read xxx bytes, err xxx, status xxx"
>
>
> Thank you for helping on this issue.
> -Matt
The ioctl is the "legacy" flow, maintained for backwards compatibility.
The mechanism to pass error messages to userspace (extack) does not
exist in ioctls, so the output you mentioned is relevant for the netlink
case.
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH net-next 2/5] net/mlx5e: Recover SQ on excessive PTP TX timestamp delta
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately Tariq Toukan
@ 2025-11-17 21:42 ` Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 3/5] net/mlx5: Remove redundant bw_share minimal value assignment Tariq Toukan
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Tariq Toukan @ 2025-11-17 21:42 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Richard Cochran, netdev, linux-rdma, linux-kernel, Gal Pressman,
Moshe Shemesh, Dragos Tatulea, Carolina Jubran
From: Carolina Jubran <cjubran@nvidia.com>
Extend the TX timestamp handler to recover the SQ when the difference
between the port and CQE TX timestamps is abnormally large.
The current logic aborts timestamp delivery if the delta exceeds
1/128 seconds, which matches the maximum expected packet interval in
ptp4l. A larger delta makes the timestamps unreliable.
This change adds recovery if the delta exceeds 0.5 seconds. Such a
large gap should not occur in normal operation and indicates that
firmware is stuck or metadata tracking is out of sync, leading to stale
or mismatched timestamps. Recovering the SQ ensures forward progress
and avoids silently dropping invalid timestamps.
The timestamp handler now takes mlx5e_ptpsq directly to access both CQ
stats and the recovery state.
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Shahar Shitrit <shshitrit@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/en/ptp.c | 21 +++++++++++++------
.../net/ethernet/mellanox/mlx5/core/en/ptp.h | 2 +-
.../net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +-
3 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index 12e10feb30f0..424f8a2728a3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -82,7 +82,7 @@ static struct mlx5e_skb_cb_hwtstamp *mlx5e_skb_cb_get_hwts(struct sk_buff *skb)
}
static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
- struct mlx5e_ptp_cq_stats *cq_stats)
+ struct mlx5e_ptpsq *ptpsq)
{
struct skb_shared_hwtstamps hwts = {};
ktime_t diff;
@@ -92,8 +92,17 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
/* Maximal allowed diff is 1 / 128 second */
if (diff > (NSEC_PER_SEC >> 7)) {
- cq_stats->abort++;
- cq_stats->abort_abs_diff_ns += diff;
+ struct mlx5e_txqsq *sq = &ptpsq->txqsq;
+
+ ptpsq->cq_stats->abort++;
+ ptpsq->cq_stats->abort_abs_diff_ns += diff;
+ if (diff > (NSEC_PER_SEC >> 1) &&
+ !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
+ netdev_warn(sq->channel->netdev,
+ "PTP TX timestamp difference between CQE and port exceeds threshold: %lld ns, recovering SQ %u\n",
+ (s64)diff, sq->sqn);
+ queue_work(sq->priv->wq, &ptpsq->report_unhealthy_work);
+ }
return;
}
@@ -103,7 +112,7 @@ static void mlx5e_skb_cb_hwtstamp_tx(struct sk_buff *skb,
void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
ktime_t hwtstamp,
- struct mlx5e_ptp_cq_stats *cq_stats)
+ struct mlx5e_ptpsq *ptpsq)
{
switch (hwtstamp_type) {
case (MLX5E_SKB_CB_CQE_HWTSTAMP):
@@ -121,7 +130,7 @@ void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
!mlx5e_skb_cb_get_hwts(skb)->port_hwtstamp)
return;
- mlx5e_skb_cb_hwtstamp_tx(skb, cq_stats);
+ mlx5e_skb_cb_hwtstamp_tx(skb, ptpsq);
memset(skb->cb, 0, sizeof(struct mlx5e_skb_cb_hwtstamp));
}
@@ -209,7 +218,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, get_cqe_ts(cqe));
mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_PORT_HWTSTAMP,
- hwtstamp, ptpsq->cq_stats);
+ hwtstamp, ptpsq);
ptpsq->cq_stats->cqe++;
mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
index 1c0e0a86a9ac..2a457a2ed707 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h
@@ -147,7 +147,7 @@ enum {
void mlx5e_skb_cb_hwtstamp_handler(struct sk_buff *skb, int hwtstamp_type,
ktime_t hwtstamp,
- struct mlx5e_ptp_cq_stats *cq_stats);
+ struct mlx5e_ptpsq *ptpsq);
void mlx5e_skb_cb_hwtstamp_init(struct sk_buff *skb);
#endif /* __MLX5_EN_PTP_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index 2702b3885f06..14884b9ea7f3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -755,7 +755,7 @@ static void mlx5e_consume_skb(struct mlx5e_txqsq *sq, struct sk_buff *skb,
hwts.hwtstamp = mlx5e_cqe_ts_to_ns(sq->ptp_cyc2time, sq->clock, ts);
if (sq->ptpsq) {
mlx5e_skb_cb_hwtstamp_handler(skb, MLX5E_SKB_CB_CQE_HWTSTAMP,
- hwts.hwtstamp, sq->ptpsq->cq_stats);
+ hwts.hwtstamp, sq->ptpsq);
} else {
skb_tstamp_tx(skb, &hwts);
sq->stats->timestamps++;
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 3/5] net/mlx5: Remove redundant bw_share minimal value assignment
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 1/5] net/mlx5: Refactor EEPROM query error handling to return status separately Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 2/5] net/mlx5e: Recover SQ on excessive PTP TX timestamp delta Tariq Toukan
@ 2025-11-17 21:42 ` Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 4/5] net/mlx5: Abort new commands if all command slots are stalled Tariq Toukan
` (2 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Tariq Toukan @ 2025-11-17 21:42 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Richard Cochran, netdev, linux-rdma, linux-kernel, Gal Pressman,
Moshe Shemesh, Dragos Tatulea, Carolina Jubran
From: Carolina Jubran <cjubran@nvidia.com>
Remove unnecessary logic that sets bw_share to minimal value, when
parent has bw_share configured but nodes don't have min_rate.
This check is redundant because the parent bandwidth acts as the upper
bound regardless, and the firmware always enforces the topmost
bandwidth constraint.
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 7 -------
1 file changed, 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
index 56e6f54b1e2e..4278bcb04c72 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c
@@ -341,13 +341,6 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
if (max_guarantee)
return max_t(u32, max_guarantee / fw_max_bw_share, 1);
- /* If nodes max min_rate divider is 0 but their parent has bw_share
- * configured, then set bw_share for nodes to minimal value.
- */
-
- if (parent && parent->bw_share)
- return 1;
-
/* If the node nodes has min_rate configured, a divider of 0 sets all
* nodes' bw_share to 0, effectively disabling min guarantees.
*/
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 4/5] net/mlx5: Abort new commands if all command slots are stalled
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
` (2 preceding siblings ...)
2025-11-17 21:42 ` [PATCH net-next 3/5] net/mlx5: Remove redundant bw_share minimal value assignment Tariq Toukan
@ 2025-11-17 21:42 ` Tariq Toukan
2025-11-17 21:42 ` [PATCH net-next 5/5] net/mlx5: Use EOPNOTSUPP instead of ENOTSUPP Tariq Toukan
2025-11-19 3:20 ` [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 patchwork-bot+netdevbpf
5 siblings, 0 replies; 9+ messages in thread
From: Tariq Toukan @ 2025-11-17 21:42 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Richard Cochran, netdev, linux-rdma, linux-kernel, Gal Pressman,
Moshe Shemesh, Dragos Tatulea
From: Saeed Mahameed <saeedm@nvidia.com>
In case of a FW issue, FW might be not responding to FW commands,
causing kernel lockout for a long period of time, e.g. rtnl_lock held
while ethtool is trying to collect stats waiting for FW to respond to
multiple commands, when all of them will timeout.
While there's no immediate indication of the FW lockout, we can safely
assume that something is wrong when all command slots are busy and in
a timeout state and no FW completion was received on any of them.
In such case, start immediately failing new commands.
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 55 +++++++++++++++++++
include/linux/mlx5/driver.h | 1 +
2 files changed, 56 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 722282cebce9..5b08e5ffe0e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -181,6 +181,7 @@ static int cmd_alloc_index(struct mlx5_cmd *cmd, struct mlx5_cmd_work_ent *ent)
static void cmd_free_index(struct mlx5_cmd *cmd, int idx)
{
lockdep_assert_held(&cmd->alloc_lock);
+ cmd->ent_arr[idx] = NULL;
set_bit(idx, &cmd->vars.bitmask);
}
@@ -1200,6 +1201,44 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
return err;
}
+/* Check if all command slots are stalled (timed out and not recovered).
+ * returns true if all slots timed out on a recent command and have not been
+ * completed by FW yet. (stalled state)
+ * false otherwise (at least one slot is not stalled).
+ *
+ * In such odd situation "all_stalled", this serves as a protection mechanism
+ * to avoid blocking the kernel for long periods of time in case FW is not
+ * responding to commands.
+ */
+static bool mlx5_cmd_all_stalled(struct mlx5_core_dev *dev)
+{
+ struct mlx5_cmd *cmd = &dev->cmd;
+ bool all_stalled = true;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&cmd->alloc_lock, flags);
+
+ /* at least one command slot is free */
+ if (bitmap_weight(&cmd->vars.bitmask, cmd->vars.max_reg_cmds) > 0) {
+ all_stalled = false;
+ goto out;
+ }
+
+ for_each_clear_bit(i, &cmd->vars.bitmask, cmd->vars.max_reg_cmds) {
+ struct mlx5_cmd_work_ent *ent = dev->cmd.ent_arr[i];
+
+ if (!test_bit(MLX5_CMD_ENT_STATE_TIMEDOUT, &ent->state)) {
+ all_stalled = false;
+ break;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+
+ return all_stalled;
+}
+
/* Notes:
* 1. Callback functions may not sleep
* 2. page queue commands do not support asynchrous completion
@@ -1230,6 +1269,15 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
if (callback && page_queue)
return -EINVAL;
+ if (!page_queue && mlx5_cmd_all_stalled(dev)) {
+ mlx5_core_err_rl(dev,
+ "All CMD slots are stalled, aborting command\n");
+ /* there's no reason to wait and block the whole kernel if FW
+ * isn't currently responding to all slots, fail immediately
+ */
+ return -EAGAIN;
+ }
+
ent = cmd_alloc_ent(cmd, in, out, uout, uout_size,
callback, context, page_queue);
if (IS_ERR(ent))
@@ -1700,6 +1748,13 @@ static void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool force
if (test_bit(i, &vector)) {
ent = cmd->ent_arr[i];
+ if (forced && ent->ret == -ETIMEDOUT)
+ set_bit(MLX5_CMD_ENT_STATE_TIMEDOUT,
+ &ent->state);
+ else if (!forced) /* real FW completion */
+ clear_bit(MLX5_CMD_ENT_STATE_TIMEDOUT,
+ &ent->state);
+
/* if we already completed the command, ignore it */
if (!test_and_clear_bit(MLX5_CMD_ENT_STATE_PENDING_COMP,
&ent->state)) {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 046396269ccf..7aec53371cf0 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -819,6 +819,7 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
enum {
MLX5_CMD_ENT_STATE_PENDING_COMP,
+ MLX5_CMD_ENT_STATE_TIMEDOUT,
};
struct mlx5_cmd_work_ent {
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 5/5] net/mlx5: Use EOPNOTSUPP instead of ENOTSUPP
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
` (3 preceding siblings ...)
2025-11-17 21:42 ` [PATCH net-next 4/5] net/mlx5: Abort new commands if all command slots are stalled Tariq Toukan
@ 2025-11-17 21:42 ` Tariq Toukan
2025-11-19 3:20 ` [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 patchwork-bot+netdevbpf
5 siblings, 0 replies; 9+ messages in thread
From: Tariq Toukan @ 2025-11-17 21:42 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Richard Cochran, netdev, linux-rdma, linux-kernel, Gal Pressman,
Moshe Shemesh, Dragos Tatulea
Per Documentation/dev-tools/checkpatch.rst, ENOTSUPP is not a standard
error code and should be avoided. EOPNOTSUPP should be used instead.
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
---
drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c | 2 +-
.../ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c | 8 ++++----
5 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
index 080e7eab52c7..7bcf822a89f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c
@@ -54,7 +54,7 @@ static int mlx5_query_mtrc_caps(struct mlx5_fw_tracer *tracer)
if (!MLX5_GET(mtrc_cap, out, trace_to_memory)) {
mlx5_core_dbg(dev, "FWTracer: Device does not support logging traces to memory\n");
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
tracer->trc_ver = MLX5_GET(mtrc_cap, out, trc_ver);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
index 79916f1abd14..63bdef5b4ba5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs_ethtool.c
@@ -704,7 +704,7 @@ static int validate_flow(struct mlx5e_priv *priv,
num_tuples += ret;
break;
default:
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
if ((fs->flow_type & FLOW_EXT)) {
ret = validate_vlan(fs);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
index e5c1012921d2..1ec61164e6b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/core.c
@@ -211,7 +211,7 @@ int mlx5_fpga_device_start(struct mlx5_core_dev *mdev)
max_num_qps = MLX5_CAP_FPGA(mdev, shell_caps.max_num_qps);
if (!max_num_qps) {
mlx5_fpga_err(fdev, "FPGA reports 0 QPs in SHELL_CAPS\n");
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto out;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
index d55e15c1f380..304912637c35 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c
@@ -149,7 +149,7 @@ struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev)
struct mlx5_vxlan *vxlan;
if (!MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || !mlx5_core_is_pf(mdev))
- return ERR_PTR(-ENOTSUPP);
+ return ERR_PTR(-EOPNOTSUPP);
vxlan = kzalloc(sizeof(*vxlan), GFP_KERNEL);
if (!vxlan)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c
index 65740bb68b09..e8c67ed9f748 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/sws/dr_domain.c
@@ -410,7 +410,7 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev,
switch (dmn->type) {
case MLX5DR_DOMAIN_TYPE_NIC_RX:
if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, rx))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
dmn->info.supp_sw_steering = true;
dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX;
@@ -419,7 +419,7 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev,
break;
case MLX5DR_DOMAIN_TYPE_NIC_TX:
if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, tx))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
dmn->info.supp_sw_steering = true;
dmn->info.tx.type = DR_DOMAIN_NIC_TYPE_TX;
@@ -428,10 +428,10 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev,
break;
case MLX5DR_DOMAIN_TYPE_FDB:
if (!dmn->info.caps.eswitch_manager)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (!DR_DOMAIN_SW_STEERING_SUPPORTED(dmn, fdb))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
dmn->info.rx.type = DR_DOMAIN_NIC_TYPE_RX;
dmn->info.tx.type = DR_DOMAIN_NIC_TYPE_TX;
--
2.31.1
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17
2025-11-17 21:42 [PATCH net-next 0/5] net/mlx5: misc changes 2025-11-17 Tariq Toukan
` (4 preceding siblings ...)
2025-11-17 21:42 ` [PATCH net-next 5/5] net/mlx5: Use EOPNOTSUPP instead of ENOTSUPP Tariq Toukan
@ 2025-11-19 3:20 ` patchwork-bot+netdevbpf
5 siblings, 0 replies; 9+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-11-19 3:20 UTC (permalink / raw)
To: Tariq Toukan
Cc: edumazet, kuba, pabeni, andrew+netdev, davem, saeedm, leon,
mbloch, richardcochran, netdev, linux-rdma, linux-kernel, gal,
moshe, dtatulea
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Mon, 17 Nov 2025 23:42:04 +0200 you wrote:
> Hi,
>
> This series contains misc enhancements to the mlx5 driver.
>
> Regards,
> Tariq
>
> [...]
Here is the summary with links:
- [net-next,1/5] net/mlx5: Refactor EEPROM query error handling to return status separately
https://git.kernel.org/netdev/net-next/c/2e4c44b12f4d
- [net-next,2/5] net/mlx5e: Recover SQ on excessive PTP TX timestamp delta
https://git.kernel.org/netdev/net-next/c/391dad2e686f
- [net-next,3/5] net/mlx5: Remove redundant bw_share minimal value assignment
https://git.kernel.org/netdev/net-next/c/ea3270351c79
- [net-next,4/5] net/mlx5: Abort new commands if all command slots are stalled
https://git.kernel.org/netdev/net-next/c/fbb9933666e3
- [net-next,5/5] net/mlx5: Use EOPNOTSUPP instead of ENOTSUPP
https://git.kernel.org/netdev/net-next/c/70ca239b612c
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 9+ messages in thread