* [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 17:57 ` Mark Bloch
2026-04-09 11:55 ` [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter Tariq Toukan
` (6 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
Representor reload during LAG/MPESW transitions has to be repeated in
several flows, and each open-coded loop was easy to get out of sync
when adding new flags or tweaking error handling. Move the sequencing
into a single helper so that all call sites share the same ordering
and checks.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 44 +++++++++++--------
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 +
.../ethernet/mellanox/mlx5/core/lag/mpesw.c | 12 ++---
3 files changed, 31 insertions(+), 26 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 449e4bd86c06..c402a8463081 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1093,6 +1093,27 @@ void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
}
}
+int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
+{
+ struct lag_func *pf;
+ int ret;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (!(pf->dev->priv.flags & flags)) {
+ struct mlx5_eswitch *esw;
+
+ esw = pf->dev->priv.eswitch;
+ ret = mlx5_eswitch_reload_ib_reps(esw);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
void mlx5_disable_lag(struct mlx5_lag *ldev)
{
bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
@@ -1130,9 +1151,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
mlx5_lag_add_devices(ldev);
if (shared_fdb)
- mlx5_ldev_for_each(i, 0, ldev)
- if (!(mlx5_lag_pf(ldev, i)->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
- mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+ mlx5_lag_reload_ib_reps(ldev, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV);
}
bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
@@ -1388,10 +1407,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
if (err) {
if (shared_fdb || roce_lag)
mlx5_lag_add_devices(ldev);
- if (shared_fdb) {
- mlx5_ldev_for_each(i, 0, ldev)
- mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
- }
+ if (shared_fdb)
+ mlx5_lag_reload_ib_reps(ldev, 0);
return;
}
@@ -1409,24 +1426,15 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
mlx5_nic_vport_enable_roce(dev);
}
} else if (shared_fdb) {
- int i;
-
dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
mlx5_rescan_drivers_locked(dev0);
-
- mlx5_ldev_for_each(i, 0, ldev) {
- err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
- if (err)
- break;
- }
-
+ err = mlx5_lag_reload_ib_reps(ldev, 0);
if (err) {
dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
mlx5_rescan_drivers_locked(dev0);
mlx5_deactivate_lag(ldev);
mlx5_lag_add_devices(ldev);
- mlx5_ldev_for_each(i, 0, ldev)
- mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+ mlx5_lag_reload_ib_reps(ldev, 0);
mlx5_core_err(dev0, "Failed to enable lag\n");
return;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 6c911374f409..db561e306fc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -199,4 +199,5 @@ int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
int mlx5_lag_num_devs(struct mlx5_lag *ldev);
int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
+int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags);
#endif /* __MLX5_LAG_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 5eea12a6887a..4d68e3092a56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -70,7 +70,6 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
struct mlx5_core_dev *dev0;
int err;
- int i;
if (ldev->mode == MLX5_LAG_MODE_MPESW)
return 0;
@@ -103,11 +102,9 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
mlx5_rescan_drivers_locked(dev0);
- mlx5_ldev_for_each(i, 0, ldev) {
- err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
- if (err)
- goto err_rescan_drivers;
- }
+ err = mlx5_lag_reload_ib_reps(ldev, 0);
+ if (err)
+ goto err_rescan_drivers;
mlx5_lag_set_vports_agg_speed(ldev);
@@ -119,8 +116,7 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
mlx5_deactivate_lag(ldev);
err_add_devices:
mlx5_lag_add_devices(ldev);
- mlx5_ldev_for_each(i, 0, ldev)
- mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+ mlx5_lag_reload_ib_reps(ldev, 0);
mlx5_mpesw_metadata_cleanup(ldev);
return err;
}
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling
2026-04-09 11:55 ` [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling Tariq Toukan
@ 2026-04-09 17:57 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 17:57 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> Representor reload during LAG/MPESW transitions has to be repeated in
> several flows, and each open-coded loop was easy to get out of sync
> when adding new flags or tweaking error handling. Move the sequencing
> into a single helper so that all call sites share the same ordering
> and checks.
>
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Reviewed-by: Shay Drory <shayd@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 44 +++++++++++--------
> .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 +
> .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 12 ++---
> 3 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> index 449e4bd86c06..c402a8463081 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> @@ -1093,6 +1093,27 @@ void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
> }
> }
>
> +int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
> +{
> + struct lag_func *pf;
> + int ret;
> + int i;
> +
> + mlx5_ldev_for_each(i, 0, ldev) {
> + pf = mlx5_lag_pf(ldev, i);
> + if (!(pf->dev->priv.flags & flags)) {
> + struct mlx5_eswitch *esw;
> +
> + esw = pf->dev->priv.eswitch;
> + ret = mlx5_eswitch_reload_ib_reps(esw);
> + if (ret)
> + return ret;
Sashiko says:
"Does this early return break best-effort teardown and error recovery paths?
The new helper aborts on the first error, but the open-coded loops it
replaces used to run unconditionally."
I am aware of this behavioral change; it is intentional.
In practice, if reloading the reps fails on one device,
continuing the loop does not result in a functional system.
The remaining devices will not operate correctly, and attempting
to proceed may further destabilize the driver state.
By aborting early, we avoid partially reinitializing the system
and instead leave it in a consistent (albeit degraded) state. In
this case, IB devices are not (re)created, which prevents user
access to a driver that is already in a broken state.
Mark
> + }
> + }
> +
> + return 0;
> +}
> +
> void mlx5_disable_lag(struct mlx5_lag *ldev)
> {
> bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
> @@ -1130,9 +1151,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
> mlx5_lag_add_devices(ldev);
>
> if (shared_fdb)
> - mlx5_ldev_for_each(i, 0, ldev)
> - if (!(mlx5_lag_pf(ldev, i)->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
> - mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> + mlx5_lag_reload_ib_reps(ldev, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV);
> }
>
> bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
> @@ -1388,10 +1407,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
> if (err) {
> if (shared_fdb || roce_lag)
> mlx5_lag_add_devices(ldev);
> - if (shared_fdb) {
> - mlx5_ldev_for_each(i, 0, ldev)
> - mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> - }
> + if (shared_fdb)
> + mlx5_lag_reload_ib_reps(ldev, 0);
>
> return;
> }
> @@ -1409,24 +1426,15 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
> mlx5_nic_vport_enable_roce(dev);
> }
> } else if (shared_fdb) {
> - int i;
> -
> dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> mlx5_rescan_drivers_locked(dev0);
> -
> - mlx5_ldev_for_each(i, 0, ldev) {
> - err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> - if (err)
> - break;
> - }
> -
> + err = mlx5_lag_reload_ib_reps(ldev, 0);
> if (err) {
> dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> mlx5_rescan_drivers_locked(dev0);
> mlx5_deactivate_lag(ldev);
> mlx5_lag_add_devices(ldev);
> - mlx5_ldev_for_each(i, 0, ldev)
> - mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> + mlx5_lag_reload_ib_reps(ldev, 0);
> mlx5_core_err(dev0, "Failed to enable lag\n");
> return;
> }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> index 6c911374f409..db561e306fc7 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
> @@ -199,4 +199,5 @@ int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
> int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
> int mlx5_lag_num_devs(struct mlx5_lag *ldev);
> int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
> +int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags);
> #endif /* __MLX5_LAG_H__ */
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> index 5eea12a6887a..4d68e3092a56 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
> @@ -70,7 +70,6 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
> int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
> struct mlx5_core_dev *dev0;
> int err;
> - int i;
>
> if (ldev->mode == MLX5_LAG_MODE_MPESW)
> return 0;
> @@ -103,11 +102,9 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
>
> dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> mlx5_rescan_drivers_locked(dev0);
> - mlx5_ldev_for_each(i, 0, ldev) {
> - err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> - if (err)
> - goto err_rescan_drivers;
> - }
> + err = mlx5_lag_reload_ib_reps(ldev, 0);
> + if (err)
> + goto err_rescan_drivers;
>
> mlx5_lag_set_vports_agg_speed(ldev);
>
> @@ -119,8 +116,7 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
> mlx5_deactivate_lag(ldev);
> err_add_devices:
> mlx5_lag_add_devices(ldev);
> - mlx5_ldev_for_each(i, 0, ldev)
> - mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
> + mlx5_lag_reload_ib_reps(ldev, 0);
> mlx5_mpesw_metadata_cleanup(ldev);
> return err;
> }
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
2026-04-09 11:55 ` [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 17:58 ` Mark Bloch
2026-04-09 11:55 ` [PATCH net-next 3/7] net/mlx5: E-Switch, introduce generic work queue dispatch helper Tariq Toukan
` (5 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
The generation counter in mlx5_esw_functions is used to detect stale
work items on the E-Switch work queue. Move it from mlx5_esw_functions
to the top-level mlx5_eswitch struct so it can guard all work types,
not just function-change events.
This is a mechanical refactor: no behavioral change.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 3 ++-
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++--
3 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 123c96716a54..1986d4d0e886 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1075,7 +1075,7 @@ static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
mlx5_eswitch_is_funcs_handler(esw->dev)) {
mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
- atomic_inc(&esw->esw_funcs.generation);
+ atomic_inc(&esw->generation);
}
}
@@ -2072,6 +2072,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
mutex_init(&esw->state_lock);
init_rwsem(&esw->mode_lock);
refcount_set(&esw->qos.refcnt, 0);
+ atomic_set(&esw->generation, 0);
esw->enabled_vports = 0;
esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 5128f5020dae..0c3d2bdebf8c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -340,7 +340,6 @@ struct mlx5_host_work {
struct mlx5_esw_functions {
struct mlx5_nb nb;
- atomic_t generation;
bool host_funcs_disabled;
u16 num_vfs;
u16 num_ec_vfs;
@@ -410,6 +409,7 @@ struct mlx5_eswitch {
struct mlx5_devcom_comp_dev *devcom;
u16 enabled_ipsec_vf_count;
bool eswitch_operation_in_progress;
+ atomic_t generation;
};
void esw_offloads_disable(struct mlx5_eswitch *esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a078d06f4567..b2e7294d3a5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3667,7 +3667,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
devl_lock(devlink);
/* Stale work from one or more mode changes ago. Bail out. */
- if (work_gen != atomic_read(&esw->esw_funcs.generation))
+ if (work_gen != atomic_read(&esw->generation))
goto unlock;
new_num_vfs = MLX5_GET(query_esw_functions_out, out,
@@ -3729,7 +3729,7 @@ int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type
esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
host_work->esw = esw;
- host_work->work_gen = atomic_read(&esw_funcs->generation);
+ host_work->work_gen = atomic_read(&esw->generation);
INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
queue_work(esw->work_queue, &host_work->work);
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter
2026-04-09 11:55 ` [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter Tariq Toukan
@ 2026-04-09 17:58 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 17:58 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> The generation counter in mlx5_esw_functions is used to detect stale
> work items on the E-Switch work queue. Move it from mlx5_esw_functions
> to the top-level mlx5_eswitch struct so it can guard all work types,
> not just function-change events.
>
> This is a mechanical refactor: no behavioral change.
>
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 3 ++-
> drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 2 +-
> drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++--
> 3 files changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> index 123c96716a54..1986d4d0e886 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> @@ -1075,7 +1075,7 @@ static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
> if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
> mlx5_eswitch_is_funcs_handler(esw->dev)) {
> mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
> - atomic_inc(&esw->esw_funcs.generation);
> + atomic_inc(&esw->generation);
> }
> }
>
> @@ -2072,6 +2072,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
> mutex_init(&esw->state_lock);
> init_rwsem(&esw->mode_lock);
> refcount_set(&esw->qos.refcnt, 0);
> + atomic_set(&esw->generation, 0);
>
> esw->enabled_vports = 0;
> esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> index 5128f5020dae..0c3d2bdebf8c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> @@ -340,7 +340,6 @@ struct mlx5_host_work {
>
> struct mlx5_esw_functions {
> struct mlx5_nb nb;
> - atomic_t generation;
> bool host_funcs_disabled;
> u16 num_vfs;
> u16 num_ec_vfs;
> @@ -410,6 +409,7 @@ struct mlx5_eswitch {
> struct mlx5_devcom_comp_dev *devcom;
> u16 enabled_ipsec_vf_count;
> bool eswitch_operation_in_progress;
> + atomic_t generation;
> };
>
> void esw_offloads_disable(struct mlx5_eswitch *esw);
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index a078d06f4567..b2e7294d3a5c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -3667,7 +3667,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
> devl_lock(devlink);
>
> /* Stale work from one or more mode changes ago. Bail out. */
> - if (work_gen != atomic_read(&esw->esw_funcs.generation))
> + if (work_gen != atomic_read(&esw->generation))
> goto unlock;
Sashiko writes:
> Does this generation counter pattern allow stale work to bypass the guard?
> While this issue seems to have existed before this patch, it appears the
> notifier can execute concurrently with the teardown path, creating a race
> condition.
> If a concurrent hardware event triggers mlx5_esw_funcs_changed_handler() on
> another CPU during mode teardown:
> 1. The teardown path calls mlx5_eswitch_event_handler_unregister(), which
> unregisters the notifier and increments esw->generation.
> 2. mlx5_eq_notifier_unregister() removes the callback but does not
> synchronize with currently executing RCU readers.
> 3. The concurrently running notifier reads the newly incremented generation
> number via atomic_read(&esw->generation) and queues host_work with it.
> 4. The worker thread (esw_vfs_changed_event_handler) blocks waiting for
> devl_lock(devlink), which is currently held by the teardown thread.
> 5. Once teardown finishes transitioning to legacy mode and drops the lock,
> the worker resumes and checks the generation.
> 6. Since both the work_gen and esw->generation match the newly incremented
> value, the guard is bypassed.
> Because teardown set esw_funcs.num_vfs = 0 but firmware might still report
> new_num_vfs > 0 (as VFs remain active), the worker calls
> mlx5_eswitch_load_vf_vports(). Could this execute offloads-specific
> initialization while the eswitch is in legacy mode and lead to state
> corruption?
False positive: atomic_notifier_call_chain() runs under the RCU
read lock, while atomic_notifier_chain_unregister()
performs a synchronize_rcu() before returning.
Mark
>
> new_num_vfs = MLX5_GET(query_esw_functions_out, out,
> @@ -3729,7 +3729,7 @@ int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type
> esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
>
> host_work->esw = esw;
> - host_work->work_gen = atomic_read(&esw_funcs->generation);
> + host_work->work_gen = atomic_read(&esw->generation);
>
> INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
> queue_work(esw->work_queue, &host_work->work);
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH net-next 3/7] net/mlx5: E-Switch, introduce generic work queue dispatch helper
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
2026-04-09 11:55 ` [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling Tariq Toukan
2026-04-09 11:55 ` [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 11:55 ` [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq Tariq Toukan
` (4 subsequent siblings)
7 siblings, 0 replies; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
Each E-Switch work item requires the same boilerplate: acquire the
devlink lock, check whether the work is stale, dispatch to the
appropriate handler, and release the lock. Factor this out.
Add a func callback to mlx5_host_work so the generic handler
esw_wq_handler() can dispatch to the right function without
duplicating locking logic. Introduce mlx5_esw_add_work() as the
single enqueue point: it stamps the work item with the current
generation counter and queues it onto the E-Switch work queue.
Refactor esw_vfs_changed_event_handler() to match the new contract:
it no longer receives work_gen or out as parameters. It queries
mlx5_esw_query_functions() itself and owns the kvfree() of the
result. The devlink lock is acquired and released by esw_wq_handler()
before dispatching, so the handler runs with the lock already held.
Update mlx5_esw_funcs_changed_handler() to use mlx5_esw_add_work().
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 1 +
.../mellanox/mlx5/core/eswitch_offloads.c | 77 +++++++++++--------
2 files changed, 45 insertions(+), 33 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 0c3d2bdebf8c..e3ab8a30c174 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -336,6 +336,7 @@ struct mlx5_host_work {
struct work_struct work;
struct mlx5_eswitch *esw;
int work_gen;
+ void (*func)(struct mlx5_eswitch *esw);
};
struct mlx5_esw_functions {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b2e7294d3a5c..23af5a12dc07 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3655,20 +3655,15 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
mutex_destroy(&esw->fdb_table.offloads.vports.lock);
}
-static void
-esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
- const u32 *out)
+static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
{
- struct devlink *devlink;
bool host_pf_disabled;
u16 new_num_vfs;
+ const u32 *out;
- devlink = priv_to_devlink(esw->dev);
- devl_lock(devlink);
-
- /* Stale work from one or more mode changes ago. Bail out. */
- if (work_gen != atomic_read(&esw->generation))
- goto unlock;
+ out = mlx5_esw_query_functions(esw->dev);
+ if (IS_ERR(out))
+ return;
new_num_vfs = MLX5_GET(query_esw_functions_out, out,
host_params_context.host_num_of_vfs);
@@ -3676,7 +3671,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
host_params_context.host_pf_disabled);
if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
- goto unlock;
+ goto free;
/* Number of VFs can only change from "0 to x" or "x to 0". */
if (esw->esw_funcs.num_vfs > 0) {
@@ -3686,54 +3681,70 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
MLX5_VPORT_UC_ADDR_CHANGE);
- if (err) {
- devl_unlock(devlink);
- return;
- }
+ if (err)
+ goto free;
}
esw->esw_funcs.num_vfs = new_num_vfs;
-unlock:
- devl_unlock(devlink);
+free:
+ kvfree(out);
}
-static void esw_functions_changed_event_handler(struct work_struct *work)
+static void esw_wq_handler(struct work_struct *work)
{
struct mlx5_host_work *host_work;
struct mlx5_eswitch *esw;
- const u32 *out;
+ struct devlink *devlink;
host_work = container_of(work, struct mlx5_host_work, work);
esw = host_work->esw;
+ devlink = priv_to_devlink(esw->dev);
- out = mlx5_esw_query_functions(esw->dev);
- if (IS_ERR(out))
- goto out;
+ devl_lock(devlink);
- esw_vfs_changed_event_handler(esw, host_work->work_gen, out);
- kvfree(out);
-out:
+ /* Stale work from one or more mode changes ago. Bail out. */
+ if (host_work->work_gen != atomic_read(&esw->generation))
+ goto unlock;
+
+ host_work->func(esw);
+
+unlock:
+ devl_unlock(devlink);
kfree(host_work);
}
-int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data)
+static int mlx5_esw_add_work(struct mlx5_eswitch *esw,
+ void (*func)(struct mlx5_eswitch *esw))
{
- struct mlx5_esw_functions *esw_funcs;
struct mlx5_host_work *host_work;
- struct mlx5_eswitch *esw;
host_work = kzalloc_obj(*host_work, GFP_ATOMIC);
if (!host_work)
- return NOTIFY_DONE;
-
- esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
- esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
+ return -ENOMEM;
host_work->esw = esw;
host_work->work_gen = atomic_read(&esw->generation);
- INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
+ host_work->func = func;
+ INIT_WORK(&host_work->work, esw_wq_handler);
queue_work(esw->work_queue, &host_work->work);
+ return 0;
+}
+
+int mlx5_esw_funcs_changed_handler(struct notifier_block *nb,
+ unsigned long type, void *data)
+{
+ struct mlx5_esw_functions *esw_funcs;
+ struct mlx5_eswitch *esw;
+ int ret;
+
+ esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
+ esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
+
+ ret = mlx5_esw_add_work(esw, esw_vfs_changed_event_handler);
+ if (ret)
+ return NOTIFY_DONE;
+
return NOTIFY_OK;
}
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
` (2 preceding siblings ...)
2026-04-09 11:55 ` [PATCH net-next 3/7] net/mlx5: E-Switch, introduce generic work queue dispatch helper Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 18:01 ` Mark Bloch
2026-04-09 11:55 ` [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration Tariq Toukan
` (3 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
mlx5_eswitch_cleanup() calls destroy_workqueue() while holding the
devlink lock (via mlx5_uninit_one()). Workers on the queue call
devl_lock() before checking whether their work is stale, which
deadlocks:
mlx5_uninit_one (holds devlink lock)
mlx5_eswitch_cleanup()
destroy_workqueue() <- waits for workers to finish
worker: devl_lock() <- blocked on
devlink lock held above
The same pattern affects mlx5_devlink_eswitch_mode_set(), which can
drain the queue while holding devlink lock.
Fix by making esw_wq_handler() check the generation counter BEFORE
acquiring the devlink lock, using devl_trylock() in a loop with
cond_resched(). If the work is stale the handler exits immediately
without ever contending for the lock.
To guarantee stale detection, increment the generation counter at
every E-Switch operation boundary:
- mlx5_eswitch_cleanup(): increment before destroy_workqueue() so
any in-flight worker sees stale and drains without blocking. Also
move mlx5_esw_qos_cleanup() to after destroy_workqueue() so it
runs only once all workers have finished.
- mlx5_devlink_eswitch_mode_set(): increment before starting the
mode change so workers from the previous mode are discarded.
- mlx5_eswitch_disable(): increment so workers queued before the
disable see stale and exit.
- mlx5_eswitch_enable() and mlx5_eswitch_disable_sriov(): increment
so in-flight work against an old VF count or mode is discarded
when these operations begin.
Remove the conditional atomic_inc() in
mlx5_eswitch_event_handler_unregister(); the mlx5_eswitch_disable()
increment now covers it unconditionally and earlier in the call chain.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.c | 11 +++++++----
.../mellanox/mlx5/core/eswitch_offloads.c | 18 +++++++++++++++++-
2 files changed, 24 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1986d4d0e886..d315484390c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1073,10 +1073,8 @@ static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw)
static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
{
if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
- mlx5_eswitch_is_funcs_handler(esw->dev)) {
+ mlx5_eswitch_is_funcs_handler(esw->dev))
mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
- atomic_inc(&esw->generation);
- }
}
static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
@@ -1701,6 +1699,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
if (toggle_lag)
mlx5_lag_disable_change(esw->dev);
+ atomic_inc(&esw->generation);
+
if (!mlx5_esw_is_fdb_created(esw)) {
ret = mlx5_eswitch_enable_locked(esw, num_vfs);
} else {
@@ -1745,6 +1745,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), necvfs(%d), active vports(%d)\n",
esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
+ atomic_inc(&esw->generation);
if (!mlx5_core_is_ecpf(esw->dev)) {
mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -1809,6 +1810,7 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
return;
devl_assert_locked(priv_to_devlink(esw->dev));
+ atomic_inc(&esw->generation);
mlx5_lag_disable_change(esw->dev);
mlx5_eswitch_disable_locked(esw);
esw->mode = MLX5_ESWITCH_LEGACY;
@@ -2110,8 +2112,9 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
esw_info(esw->dev, "cleanup\n");
- mlx5_esw_qos_cleanup(esw);
+ atomic_inc(&esw->generation);
destroy_workqueue(esw->work_queue);
+ mlx5_esw_qos_cleanup(esw);
WARN_ON(refcount_read(&esw->qos.refcnt));
mutex_destroy(&esw->state_lock);
WARN_ON(!xa_empty(&esw->offloads.vhca_map));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 23af5a12dc07..988595e1b425 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3699,7 +3699,20 @@ static void esw_wq_handler(struct work_struct *work)
esw = host_work->esw;
devlink = priv_to_devlink(esw->dev);
- devl_lock(devlink);
+ /* Check for stale work BEFORE acquiring devlink lock.
+ * mlx5_eswitch_cleanup() increments the generation counter
+ * before destroy_workqueue() while holding devlink lock,
+ * so acquiring devlink lock here would deadlock.
+ */
+ for (;;) {
+ if (host_work->work_gen != atomic_read(&esw->generation))
+ goto free;
+
+ if (devl_trylock(devlink))
+ break;
+
+ cond_resched();
+ }
/* Stale work from one or more mode changes ago. Bail out. */
if (host_work->work_gen != atomic_read(&esw->generation))
@@ -3709,6 +3722,7 @@ static void esw_wq_handler(struct work_struct *work)
unlock:
devl_unlock(devlink);
+free:
kfree(host_work);
}
@@ -4161,6 +4175,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
goto skip;
}
+ atomic_inc(&esw->generation);
+
if (mlx5_mode == MLX5_ESWITCH_LEGACY)
esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY;
if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq
2026-04-09 11:55 ` [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq Tariq Toukan
@ 2026-04-09 18:01 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 18:01 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> mlx5_eswitch_cleanup() calls destroy_workqueue() while holding the
> devlink lock (via mlx5_uninit_one()). Workers on the queue call
> devl_lock() before checking whether their work is stale, which
> deadlocks:
>
> mlx5_uninit_one (holds devlink lock)
> mlx5_eswitch_cleanup()
> destroy_workqueue() <- waits for workers to finish
> worker: devl_lock() <- blocked on
> devlink lock held above
>
> The same pattern affects mlx5_devlink_eswitch_mode_set(), which can
> drain the queue while holding devlink lock.
>
> Fix by making esw_wq_handler() check the generation counter BEFORE
> acquiring the devlink lock, using devl_trylock() in a loop with
> cond_resched(). If the work is stale the handler exits immediately
> without ever contending for the lock.
>
> To guarantee stale detection, increment the generation counter at
> every E-Switch operation boundary:
>
> - mlx5_eswitch_cleanup(): increment before destroy_workqueue() so
> any in-flight worker sees stale and drains without blocking. Also
> move mlx5_esw_qos_cleanup() to after destroy_workqueue() so it
> runs only once all workers have finished.
> - mlx5_devlink_eswitch_mode_set(): increment before starting the
> mode change so workers from the previous mode are discarded.
> - mlx5_eswitch_disable(): increment so workers queued before the
> disable see stale and exit.
> - mlx5_eswitch_enable() and mlx5_eswitch_disable_sriov(): increment
> so in-flight work against an old VF count or mode is discarded
> when these operations begin.
>
> Remove the conditional atomic_inc() in
> mlx5_eswitch_event_handler_unregister(); the mlx5_eswitch_disable()
> increment now covers it unconditionally and earlier in the call chain.
>
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Sashiko writes:
> This commit modifies the E-Switch workqueue handlers to use a generation
> counter and a trylock loop to avoid deadlocks when tearing down the
> workqueue while holding the devlink lock.
> Does the open-coded trylock retry loop in esw_wq_handler() violate the
> networking subsystem locking guidelines?
> The updated handler uses a loop that repeatedly calls devl_trylock() and
> cond_resched() to wait for the lock. The networking guidelines explicitly
> warn that using a trylock in a loop to avoid a lock ordering issue is a
> sign of incorrect locking design.
> Can this lead to a priority inversion livelock? If a thread holds the
> devlink lock for a long-running operation and is preempted, a worker thread
> with a higher scheduling priority could spin continuously in this loop,
> preventing the lock holder from ever completing its work and releasing the
> lock.
There is no such worker today, although in theory such a scenario is
possible. It is the driver’s responsibility to ensure that long-held
devlink locks are not combined with conditions that could lead to such
starvation.
> Does moving the atomic_inc() of esw->generation before unregistering the EQ
> notifier introduce a race condition?
> In mlx5_eswitch_disable() and mlx5_devlink_eswitch_mode_set(), the
> generation counter is now incremented before mlx5_eswitch_disable_locked()
> unregisters the asynchronous EQ notifier.
> If a hardware EQ event fires in the narrow window after the generation
> counter is incremented but before the notifier is unregistered:
> - The event handler queues a new work item tagged with the newly incremented
> generation counter.
> - The teardown or mode change finishes and releases the devlink lock.
> - The worker thread executes, and because its host_work->work_gen matches
> the new esw->generation, the stale check evaluates to false.
> - The worker then executes its payload on an E-Switch that has already been
> transitioned to legacy mode or disabled.
> Could this sequence lead to state corruption by running offloads setup code
> on a torn-down E-Switch?
Good catch, this looks like a real issue. I don't know why our internal sashiko
didn't flag this :/
In both mlx5_devlink_eswitch_mode_set() and mlx5_eswitch_disable(),
the atomic_inc() should come after
mlx5_eswitch_disable_locked() to avoid the race described above.
Commit aed763abf0e905b4b8d747d1ba9e172961572f57 was intended to address
this class of problems. Some of the discussion around it can be found here:
https://lore.kernel.org/all/1769503961-124173-3-git-send-email-tariqt@nvidia.com/
However, Cosmin’s approach does not cover all deadlock scenarios.
Specifically, mlx5_eswitch_cleanup() is invoked under the devlink lock
and calls destroy_workqueue(esw->work_queue). If work was already queued,
this can still lead to a deadlock.
It’s also worth noting that mlx5_eswitch_cleanup() is only reached when
the driver is being torn down. At that point we must guarantee that all
resources are freed, so some form of synchronization / waiting is
unavoidable.
Mark
> ---
> .../net/ethernet/mellanox/mlx5/core/eswitch.c | 11 +++++++----
> .../mellanox/mlx5/core/eswitch_offloads.c | 18 +++++++++++++++++-
> 2 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> index 1986d4d0e886..d315484390c8 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> @@ -1073,10 +1073,8 @@ static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw)
> static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
> {
> if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
> - mlx5_eswitch_is_funcs_handler(esw->dev)) {
> + mlx5_eswitch_is_funcs_handler(esw->dev))
> mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
> - atomic_inc(&esw->generation);
> - }
> }
>
> static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
> @@ -1701,6 +1699,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
> if (toggle_lag)
> mlx5_lag_disable_change(esw->dev);
>
> + atomic_inc(&esw->generation);
> +
> if (!mlx5_esw_is_fdb_created(esw)) {
> ret = mlx5_eswitch_enable_locked(esw, num_vfs);
> } else {
> @@ -1745,6 +1745,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
> esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), necvfs(%d), active vports(%d)\n",
> esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
> esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
> + atomic_inc(&esw->generation);
>
> if (!mlx5_core_is_ecpf(esw->dev)) {
> mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
> @@ -1809,6 +1810,7 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
> return;
>
> devl_assert_locked(priv_to_devlink(esw->dev));
> + atomic_inc(&esw->generation);
> mlx5_lag_disable_change(esw->dev);
> mlx5_eswitch_disable_locked(esw);
> esw->mode = MLX5_ESWITCH_LEGACY;
> @@ -2110,8 +2112,9 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
>
> esw_info(esw->dev, "cleanup\n");
>
> - mlx5_esw_qos_cleanup(esw);
> + atomic_inc(&esw->generation);
> destroy_workqueue(esw->work_queue);
> + mlx5_esw_qos_cleanup(esw);
> WARN_ON(refcount_read(&esw->qos.refcnt));
> mutex_destroy(&esw->state_lock);
> WARN_ON(!xa_empty(&esw->offloads.vhca_map));
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 23af5a12dc07..988595e1b425 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -3699,7 +3699,20 @@ static void esw_wq_handler(struct work_struct *work)
> esw = host_work->esw;
> devlink = priv_to_devlink(esw->dev);
>
> - devl_lock(devlink);
> + /* Check for stale work BEFORE acquiring devlink lock.
> + * mlx5_eswitch_cleanup() increments the generation counter
> + * before destroy_workqueue() while holding devlink lock,
> + * so acquiring devlink lock here would deadlock.
> + */
> + for (;;) {
> + if (host_work->work_gen != atomic_read(&esw->generation))
> + goto free;
> +
> + if (devl_trylock(devlink))
> + break;
> +
> + cond_resched();
> + }
>
> /* Stale work from one or more mode changes ago. Bail out. */
> if (host_work->work_gen != atomic_read(&esw->generation))
> @@ -3709,6 +3722,7 @@ static void esw_wq_handler(struct work_struct *work)
>
> unlock:
> devl_unlock(devlink);
> +free:
> kfree(host_work);
> }
>
> @@ -4161,6 +4175,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> goto skip;
> }
>
> + atomic_inc(&esw->generation);
> +
> if (mlx5_mode == MLX5_ESWITCH_LEGACY)
> esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY;
> if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
` (3 preceding siblings ...)
2026-04-09 11:55 ` [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 18:02 ` Mark Bloch
2026-04-09 11:55 ` [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration Tariq Toukan
` (2 subsequent siblings)
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
Introduce a simple atomic block state via mlx5_esw_reps_block() and
mlx5_esw_reps_unblock(). Internally, mlx5_esw_mark_reps() spins a
cmpxchg between the UNBLOCKED and BLOCKED states. All E-Switch
reconfiguration paths (mode set, enable, disable, VF/SF add/del, LAG
reload) now bracket their work with this guard so representor changes
won't race with the ongoing E-Switch update, yet we remain
non-blocking and avoid new locks.
A spinlock is out because the protected work can sleep (RDMA ops,
devcom, netdev callbacks). A mutex won't work either: esw_mode_change()
has to drop the guard mid-flight so mlx5_rescan_drivers_locked() can
reload mlx5_ib, which calls back into mlx5_eswitch_register_vport_reps()
on the same thread. Beyond that, any real lock would create an ABBA
cycle: the LAG side holds the LAG lock when it calls reps_block(), and
the mlx5_ib side holds RDMA locks when it calls register_vport_reps(),
and those two subsystems talk to each other. The atomic CAS loop avoids
all of this - no lock ordering, no sleep restrictions, and the owner
can drop the guard and let a nested caller win the next transition
before reclaiming it.
With this infrastructure in place, downstream patches can safely tie
representor load/unload to the mlx5_ib module's lifecycle. Loading
mlx5_ib while the device is in switchdev mode has failed to bring up
the IB representors for years; those patches will finally fix that.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.c | 13 ++++
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 6 ++
.../mellanox/mlx5/core/eswitch_offloads.c | 77 +++++++++++++++++--
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 2 +
.../ethernet/mellanox/mlx5/core/sf/devlink.c | 5 ++
include/linux/mlx5/eswitch.h | 5 ++
6 files changed, 100 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index d315484390c8..a7701c9d776a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1700,6 +1700,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
mlx5_lag_disable_change(esw->dev);
atomic_inc(&esw->generation);
+ mlx5_esw_reps_block(esw);
if (!mlx5_esw_is_fdb_created(esw)) {
ret = mlx5_eswitch_enable_locked(esw, num_vfs);
@@ -1723,6 +1724,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
}
}
+ mlx5_esw_reps_unblock(esw);
+
if (toggle_lag)
mlx5_lag_enable_change(esw->dev);
@@ -1747,6 +1750,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
atomic_inc(&esw->generation);
+ mlx5_esw_reps_block(esw);
+
if (!mlx5_core_is_ecpf(esw->dev)) {
mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
if (clear_vf)
@@ -1757,6 +1762,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
mlx5_eswitch_clear_ec_vf_vports_info(esw);
}
+ mlx5_esw_reps_unblock(esw);
+
if (esw->mode == MLX5_ESWITCH_OFFLOADS) {
struct devlink *devlink = priv_to_devlink(esw->dev);
@@ -1812,7 +1819,11 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
devl_assert_locked(priv_to_devlink(esw->dev));
atomic_inc(&esw->generation);
mlx5_lag_disable_change(esw->dev);
+
+ mlx5_esw_reps_block(esw);
mlx5_eswitch_disable_locked(esw);
+ mlx5_esw_reps_unblock(esw);
+
esw->mode = MLX5_ESWITCH_LEGACY;
mlx5_lag_enable_change(esw->dev);
}
@@ -2075,6 +2086,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
init_rwsem(&esw->mode_lock);
refcount_set(&esw->qos.refcnt, 0);
atomic_set(&esw->generation, 0);
+ atomic_set(&esw->offloads.reps_conf_state,
+ MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
esw->enabled_vports = 0;
esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index e3ab8a30c174..256ac3ad37bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -315,6 +315,7 @@ struct mlx5_esw_offload {
DECLARE_HASHTABLE(termtbl_tbl, 8);
struct mutex termtbl_mutex; /* protects termtbl hash */
struct xarray vhca_map;
+ atomic_t reps_conf_state;
const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
u8 inline_mode;
atomic64_t num_flows;
@@ -949,6 +950,8 @@ mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw,
struct mlx5_flow_handle *
mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
struct mlx5_flow_table *lag_ft);
+void mlx5_esw_reps_block(struct mlx5_eswitch *esw);
+void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw);
#else /* CONFIG_MLX5_ESWITCH */
/* eswitch API stubs */
static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
@@ -1026,6 +1029,9 @@ mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev)
return true;
}
+static inline void mlx5_esw_reps_block(struct mlx5_eswitch *esw) {}
+static inline void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw) {}
+
static inline bool
mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 988595e1b425..4b626ffcfa8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2410,23 +2410,56 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
return err;
}
+static void mlx5_esw_assert_reps_blocked(struct mlx5_eswitch *esw)
+{
+ if (atomic_read(&esw->offloads.reps_conf_state) ==
+ MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED)
+ return;
+
+ esw_warn(esw->dev, "reps state machine violated: expected BLOCKED\n");
+}
+
+static void mlx5_esw_mark_reps(struct mlx5_eswitch *esw,
+ enum mlx5_esw_offloads_rep_type_state old,
+ enum mlx5_esw_offloads_rep_type_state new)
+{
+ atomic_t *reps_conf_state = &esw->offloads.reps_conf_state;
+
+ do {
+ atomic_cond_read_relaxed(reps_conf_state, VAL == old);
+ } while (atomic_cmpxchg(reps_conf_state, old, new) != old);
+}
+
+void mlx5_esw_reps_block(struct mlx5_eswitch *esw)
+{
+ mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
+ MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED);
+}
+
+void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw)
+{
+ mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
+ MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
+}
+
static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
{
+ mlx5_esw_reps_unblock(esw);
mlx5_devcom_comp_lock(esw->dev->priv.hca_devcom_comp);
if (esw->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV ||
mlx5_core_mp_enabled(esw->dev)) {
esw->mode = mode;
- mlx5_rescan_drivers_locked(esw->dev);
- mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
- return;
+ goto out;
}
esw->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
mlx5_rescan_drivers_locked(esw->dev);
esw->mode = mode;
esw->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+out:
mlx5_rescan_drivers_locked(esw->dev);
mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
+ mlx5_esw_reps_block(esw);
}
static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw)
@@ -2761,6 +2794,8 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep, u8 rep_type)
{
+ mlx5_esw_assert_reps_blocked(esw);
+
if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
REP_REGISTERED, REP_LOADED) == REP_REGISTERED)
return esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
@@ -2771,6 +2806,8 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
struct mlx5_eswitch_rep *rep, u8 rep_type)
{
+ mlx5_esw_assert_reps_blocked(esw);
+
if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
REP_LOADED, REP_REGISTERED) == REP_LOADED) {
if (rep_type == REP_ETH)
@@ -3673,6 +3710,7 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
goto free;
+ mlx5_esw_reps_block(esw);
/* Number of VFs can only change from "0 to x" or "x to 0". */
if (esw->esw_funcs.num_vfs > 0) {
mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -3682,9 +3720,11 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
MLX5_VPORT_UC_ADDR_CHANGE);
if (err)
- goto free;
+ goto unblock;
}
esw->esw_funcs.num_vfs = new_num_vfs;
+unblock:
+ mlx5_esw_reps_unblock(esw);
free:
kvfree(out);
}
@@ -4164,6 +4204,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
goto unlock;
}
+ mlx5_esw_reps_block(esw);
esw->eswitch_operation_in_progress = true;
up_write(&esw->mode_lock);
@@ -4203,6 +4244,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
mlx5_devlink_netdev_netns_immutable_set(devlink, false);
down_write(&esw->mode_lock);
esw->eswitch_operation_in_progress = false;
+ mlx5_esw_reps_unblock(esw);
unlock:
mlx5_esw_unlock(esw);
enable_lag:
@@ -4474,9 +4516,10 @@ mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num)
return true;
}
-void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
- const struct mlx5_eswitch_rep_ops *ops,
- u8 rep_type)
+static void
+mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
+ const struct mlx5_eswitch_rep_ops *ops,
+ u8 rep_type)
{
struct mlx5_eswitch_rep_data *rep_data;
struct mlx5_eswitch_rep *rep;
@@ -4491,9 +4534,20 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
}
}
}
+
+void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
+ const struct mlx5_eswitch_rep_ops *ops,
+ u8 rep_type)
+{
+ mlx5_esw_reps_block(esw);
+ mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
+ mlx5_esw_reps_unblock(esw);
+}
EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
-void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
+static void
+mlx5_eswitch_unregister_vport_reps_blocked(struct mlx5_eswitch *esw,
+ u8 rep_type)
{
struct mlx5_eswitch_rep *rep;
unsigned long i;
@@ -4504,6 +4558,13 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
mlx5_esw_for_each_rep(esw, i, rep)
atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
}
+
+void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
+{
+ mlx5_esw_reps_block(esw);
+ mlx5_eswitch_unregister_vport_reps_blocked(esw, rep_type);
+ mlx5_esw_reps_unblock(esw);
+}
EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index c402a8463081..ff2e6f6caa0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1105,7 +1105,9 @@ int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
struct mlx5_eswitch *esw;
esw = pf->dev->priv.eswitch;
+ mlx5_esw_reps_block(esw);
ret = mlx5_eswitch_reload_ib_reps(esw);
+ mlx5_esw_reps_unblock(esw);
if (ret)
return ret;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 8503e532f423..2fc69897e35b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -245,8 +245,10 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table,
if (IS_ERR(sf))
return PTR_ERR(sf);
+ mlx5_esw_reps_block(esw);
err = mlx5_eswitch_load_sf_vport(esw, sf->hw_fn_id, MLX5_VPORT_UC_ADDR_CHANGE,
&sf->dl_port, new_attr->controller, new_attr->sfnum);
+ mlx5_esw_reps_unblock(esw);
if (err)
goto esw_err;
*dl_port = &sf->dl_port.dl_port;
@@ -367,7 +369,10 @@ int mlx5_devlink_sf_port_del(struct devlink *devlink,
struct mlx5_sf_table *table = dev->priv.sf_table;
struct mlx5_sf *sf = mlx5_sf_by_dl_port(dl_port);
+ mlx5_esw_reps_block(dev->priv.eswitch);
mlx5_sf_del(table, sf);
+ mlx5_esw_reps_unblock(dev->priv.eswitch);
+
return 0;
}
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 67256e776566..786b1ea83843 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -29,6 +29,11 @@ enum {
REP_LOADED,
};
+enum mlx5_esw_offloads_rep_type_state {
+ MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
+ MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
+};
+
enum mlx5_switchdev_event {
MLX5_SWITCHDEV_EVENT_PAIR,
MLX5_SWITCHDEV_EVENT_UNPAIR,
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration
2026-04-09 11:55 ` [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration Tariq Toukan
@ 2026-04-09 18:02 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 18:02 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> Introduce a simple atomic block state via mlx5_esw_reps_block() and
> mlx5_esw_reps_unblock(). Internally, mlx5_esw_mark_reps() spins a
> cmpxchg between the UNBLOCKED and BLOCKED states. All E-Switch
> reconfiguration paths (mode set, enable, disable, VF/SF add/del, LAG
> reload) now bracket their work with this guard so representor changes
> won't race with the ongoing E-Switch update, yet we remain
> non-blocking and avoid new locks.
>
> A spinlock is out because the protected work can sleep (RDMA ops,
> devcom, netdev callbacks). A mutex won't work either: esw_mode_change()
> has to drop the guard mid-flight so mlx5_rescan_drivers_locked() can
> reload mlx5_ib, which calls back into mlx5_eswitch_register_vport_reps()
> on the same thread. Beyond that, any real lock would create an ABBA
> cycle: the LAG side holds the LAG lock when it calls reps_block(), and
> the mlx5_ib side holds RDMA locks when it calls register_vport_reps(),
> and those two subsystems talk to each other. The atomic CAS loop avoids
> all of this - no lock ordering, no sleep restrictions, and the owner
> can drop the guard and let a nested caller win the next transition
> before reclaiming it.
>
> With this infrastructure in place, downstream patches can safely tie
> representor load/unload to the mlx5_ib module's lifecycle. Loading
> mlx5_ib while the device is in switchdev mode has failed to bring up
> the IB representors for years; those patches will finally fix that.
>
Sashiko writes:
> Note: The format of this report is altered to provide a free-form summary of
> the findings without directly quoting the original patch code due to system
> recitation restrictions.
> commit c88c0130c17d45e785dce9dae505cf3bd9664ab3
> Author: Mark Bloch <mbloch@nvidia.com>
> net/mlx5: E-Switch, block representors during reconfiguration
> This commit introduces an atomic block state via mlx5_esw_reps_block() and
> mlx5_esw_reps_unblock() to guard E-Switch reconfiguration paths. It uses an
> atomic CAS loop instead of standard locks to avoid sleep restrictions and an
> existing ABBA deadlock cycle.
> The commit message states that an atomic CAS loop is used because the
> protected work can sleep and a real lock would create an ABBA cycle. Does
> introducing an ad-hoc busy-wait lock to protect sleepable sections risk
> soft lockups or priority inversion when the lock owner sleeps?
> Furthermore, does bypassing lockdep with a custom loop hide the underlying
> ABBA deadlock rather than resolving it structurally? Could a standard
> synchronization primitive like a mutex be used while fixing the lock
> inversion?
What we actually guard here is the assignment/clearing of
esw->offloads.rep_ops[rep_type].
The root of the problem is the current mlx5 architecture, where
mlx5_core and mlx5_ib are separate modules. The auxiliary bus was
introduced to decouple them, but it also introduced additional
complexity.
In particular, during eswitch mode transitions (handled in mlx5_core),
the IB module may be loaded or unloaded concurrently. While this may
sound benign, switching into or out of switchdev mode requires creating
or destroying an IB device, which makes the interaction non-trivial.
To handle this, the eswitch code performs a fairly complex sequence of
auxiliary device reloads during mode transitions, to ensure the system
remains consistent.
The API here is intentionally minimal: the critical paths are
mlx5_eswitch_register_vport_reps() and
mlx5_eswitch_unregister_vport_reps(), which are invoked by mlx5_ib.
These functions primarily assign state and ops, and the goal is to
ensure that rep_ops is not cleared or reassigned in the middle of a
representor load/unload sequence.
There is still a known issue on the unregister path, where representors
are torn down. I plan to address this in a follow-up series. This patch
does not make the situation worse; it ensures that all existing cases
are handled, with the remaining gap being that unloading should not be
performed directly under mlx5_eswitch_unregister_vport_reps(). This
will be fixed separately.
> In esw_mode_change(), the representor block is intentionally released and
> then reacquired to allow mlx5_rescan_drivers_locked() to run. Because this
> custom atomic guard lacks owner tracking, does dropping it mid-flight open a
> window where any other concurrent thread could acquire the block and mutate
> representor state? Does this defeat the mutual exclusion the block was
> intended to provide against concurrent operations?
This is intentional.
We do not require strict exclusion with respect to IB representors
during mode transitions. The requirement is that the eswitch and the
driver remain in a consistent and safe state (i.e., no crashes or
corruption).
If userspace races IB module load/unload with E-Switch mode changes,
the worst expected outcome after this change is that no IB device is
present. That is considered acceptable behavior.
> The mlx5_esw_assert_reps_blocked() function checks if the global
> reps_conf_state is set to BLOCKED. Since there is no lock ownership
> tracking, if one thread holds the block, wouldn't this assertion silently
> pass for an entirely different thread that failed to acquire the block?
> Could this provide a false sense of security compared to a proper
> lockdep_assert_held() check?
This assertion is not meant to provide ownership guarantees like
lockdep_assert_held(). Its purpose is to catch incorrect usage of the
API.
In practice, there is effectively a single call path that manipulates
representors at a time, and triggering concurrent access requires deliberate
effort. The assertion is therefore primarily a safeguard against future
code paths accidentally bypassing the intended sequencing.
Mark
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> .../net/ethernet/mellanox/mlx5/core/eswitch.c | 13 ++++
> .../net/ethernet/mellanox/mlx5/core/eswitch.h | 6 ++
> .../mellanox/mlx5/core/eswitch_offloads.c | 77 +++++++++++++++++--
> .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 2 +
> .../ethernet/mellanox/mlx5/core/sf/devlink.c | 5 ++
> include/linux/mlx5/eswitch.h | 5 ++
> 6 files changed, 100 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> index d315484390c8..a7701c9d776a 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> @@ -1700,6 +1700,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
> mlx5_lag_disable_change(esw->dev);
>
> atomic_inc(&esw->generation);
> + mlx5_esw_reps_block(esw);
>
> if (!mlx5_esw_is_fdb_created(esw)) {
> ret = mlx5_eswitch_enable_locked(esw, num_vfs);
> @@ -1723,6 +1724,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
> }
> }
>
> + mlx5_esw_reps_unblock(esw);
> +
> if (toggle_lag)
> mlx5_lag_enable_change(esw->dev);
>
> @@ -1747,6 +1750,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
> esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
> atomic_inc(&esw->generation);
>
> + mlx5_esw_reps_block(esw);
> +
> if (!mlx5_core_is_ecpf(esw->dev)) {
> mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
> if (clear_vf)
> @@ -1757,6 +1762,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
> mlx5_eswitch_clear_ec_vf_vports_info(esw);
> }
>
> + mlx5_esw_reps_unblock(esw);
> +
> if (esw->mode == MLX5_ESWITCH_OFFLOADS) {
> struct devlink *devlink = priv_to_devlink(esw->dev);
>
> @@ -1812,7 +1819,11 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
> devl_assert_locked(priv_to_devlink(esw->dev));
> atomic_inc(&esw->generation);
> mlx5_lag_disable_change(esw->dev);
> +
> + mlx5_esw_reps_block(esw);
> mlx5_eswitch_disable_locked(esw);
> + mlx5_esw_reps_unblock(esw);
> +
> esw->mode = MLX5_ESWITCH_LEGACY;
> mlx5_lag_enable_change(esw->dev);
> }
> @@ -2075,6 +2086,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
> init_rwsem(&esw->mode_lock);
> refcount_set(&esw->qos.refcnt, 0);
> atomic_set(&esw->generation, 0);
> + atomic_set(&esw->offloads.reps_conf_state,
> + MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
>
> esw->enabled_vports = 0;
> esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> index e3ab8a30c174..256ac3ad37bc 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> @@ -315,6 +315,7 @@ struct mlx5_esw_offload {
> DECLARE_HASHTABLE(termtbl_tbl, 8);
> struct mutex termtbl_mutex; /* protects termtbl hash */
> struct xarray vhca_map;
> + atomic_t reps_conf_state;
> const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
> u8 inline_mode;
> atomic64_t num_flows;
> @@ -949,6 +950,8 @@ mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw,
> struct mlx5_flow_handle *
> mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
> struct mlx5_flow_table *lag_ft);
> +void mlx5_esw_reps_block(struct mlx5_eswitch *esw);
> +void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw);
> #else /* CONFIG_MLX5_ESWITCH */
> /* eswitch API stubs */
> static inline int mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
> @@ -1026,6 +1029,9 @@ mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev)
> return true;
> }
>
> +static inline void mlx5_esw_reps_block(struct mlx5_eswitch *esw) {}
> +static inline void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw) {}
> +
> static inline bool
> mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id)
> {
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 988595e1b425..4b626ffcfa8e 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -2410,23 +2410,56 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
> return err;
> }
>
> +static void mlx5_esw_assert_reps_blocked(struct mlx5_eswitch *esw)
> +{
> + if (atomic_read(&esw->offloads.reps_conf_state) ==
> + MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED)
> + return;
> +
> + esw_warn(esw->dev, "reps state machine violated: expected BLOCKED\n");
> +}
> +
> +static void mlx5_esw_mark_reps(struct mlx5_eswitch *esw,
> + enum mlx5_esw_offloads_rep_type_state old,
> + enum mlx5_esw_offloads_rep_type_state new)
> +{
> + atomic_t *reps_conf_state = &esw->offloads.reps_conf_state;
> +
> + do {
> + atomic_cond_read_relaxed(reps_conf_state, VAL == old);
> + } while (atomic_cmpxchg(reps_conf_state, old, new) != old);
> +}
> +
> +void mlx5_esw_reps_block(struct mlx5_eswitch *esw)
> +{
> + mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
> + MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED);
> +}
> +
> +void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw)
> +{
> + mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
> + MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
> +}
> +
> static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
> {
> + mlx5_esw_reps_unblock(esw);
> mlx5_devcom_comp_lock(esw->dev->priv.hca_devcom_comp);
> if (esw->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV ||
> mlx5_core_mp_enabled(esw->dev)) {
> esw->mode = mode;
> - mlx5_rescan_drivers_locked(esw->dev);
> - mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
> - return;
> + goto out;
> }
>
> esw->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> mlx5_rescan_drivers_locked(esw->dev);
> esw->mode = mode;
> esw->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> +out:
> mlx5_rescan_drivers_locked(esw->dev);
> mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
> + mlx5_esw_reps_block(esw);
> }
>
> static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw)
> @@ -2761,6 +2794,8 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
> static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
> struct mlx5_eswitch_rep *rep, u8 rep_type)
> {
> + mlx5_esw_assert_reps_blocked(esw);
> +
> if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
> REP_REGISTERED, REP_LOADED) == REP_REGISTERED)
> return esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
> @@ -2771,6 +2806,8 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
> static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
> struct mlx5_eswitch_rep *rep, u8 rep_type)
> {
> + mlx5_esw_assert_reps_blocked(esw);
> +
> if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
> REP_LOADED, REP_REGISTERED) == REP_LOADED) {
> if (rep_type == REP_ETH)
> @@ -3673,6 +3710,7 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
> if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
> goto free;
>
> + mlx5_esw_reps_block(esw);
> /* Number of VFs can only change from "0 to x" or "x to 0". */
> if (esw->esw_funcs.num_vfs > 0) {
> mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
> @@ -3682,9 +3720,11 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
> err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
> MLX5_VPORT_UC_ADDR_CHANGE);
> if (err)
> - goto free;
> + goto unblock;
> }
> esw->esw_funcs.num_vfs = new_num_vfs;
> +unblock:
> + mlx5_esw_reps_unblock(esw);
> free:
> kvfree(out);
> }
> @@ -4164,6 +4204,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> goto unlock;
> }
>
> + mlx5_esw_reps_block(esw);
> esw->eswitch_operation_in_progress = true;
> up_write(&esw->mode_lock);
>
> @@ -4203,6 +4244,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> mlx5_devlink_netdev_netns_immutable_set(devlink, false);
> down_write(&esw->mode_lock);
> esw->eswitch_operation_in_progress = false;
> + mlx5_esw_reps_unblock(esw);
> unlock:
> mlx5_esw_unlock(esw);
> enable_lag:
> @@ -4474,9 +4516,10 @@ mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num)
> return true;
> }
>
> -void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> - const struct mlx5_eswitch_rep_ops *ops,
> - u8 rep_type)
> +static void
> +mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type)
> {
> struct mlx5_eswitch_rep_data *rep_data;
> struct mlx5_eswitch_rep *rep;
> @@ -4491,9 +4534,20 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> }
> }
> }
> +
> +void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type)
> +{
> + mlx5_esw_reps_block(esw);
> + mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
> + mlx5_esw_reps_unblock(esw);
> +}
> EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
>
> -void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
> +static void
> +mlx5_eswitch_unregister_vport_reps_blocked(struct mlx5_eswitch *esw,
> + u8 rep_type)
> {
> struct mlx5_eswitch_rep *rep;
> unsigned long i;
> @@ -4504,6 +4558,13 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
> mlx5_esw_for_each_rep(esw, i, rep)
> atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
> }
> +
> +void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
> +{
> + mlx5_esw_reps_block(esw);
> + mlx5_eswitch_unregister_vport_reps_blocked(esw, rep_type);
> + mlx5_esw_reps_unblock(esw);
> +}
> EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
>
> void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> index c402a8463081..ff2e6f6caa0c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
> @@ -1105,7 +1105,9 @@ int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
> struct mlx5_eswitch *esw;
>
> esw = pf->dev->priv.eswitch;
> + mlx5_esw_reps_block(esw);
> ret = mlx5_eswitch_reload_ib_reps(esw);
> + mlx5_esw_reps_unblock(esw);
> if (ret)
> return ret;
> }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> index 8503e532f423..2fc69897e35b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> @@ -245,8 +245,10 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table,
> if (IS_ERR(sf))
> return PTR_ERR(sf);
>
> + mlx5_esw_reps_block(esw);
> err = mlx5_eswitch_load_sf_vport(esw, sf->hw_fn_id, MLX5_VPORT_UC_ADDR_CHANGE,
> &sf->dl_port, new_attr->controller, new_attr->sfnum);
> + mlx5_esw_reps_unblock(esw);
> if (err)
> goto esw_err;
> *dl_port = &sf->dl_port.dl_port;
> @@ -367,7 +369,10 @@ int mlx5_devlink_sf_port_del(struct devlink *devlink,
> struct mlx5_sf_table *table = dev->priv.sf_table;
> struct mlx5_sf *sf = mlx5_sf_by_dl_port(dl_port);
>
> + mlx5_esw_reps_block(dev->priv.eswitch);
> mlx5_sf_del(table, sf);
> + mlx5_esw_reps_unblock(dev->priv.eswitch);
> +
> return 0;
> }
>
> diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
> index 67256e776566..786b1ea83843 100644
> --- a/include/linux/mlx5/eswitch.h
> +++ b/include/linux/mlx5/eswitch.h
> @@ -29,6 +29,11 @@ enum {
> REP_LOADED,
> };
>
> +enum mlx5_esw_offloads_rep_type_state {
> + MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
> + MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
> +};
> +
> enum mlx5_switchdev_event {
> MLX5_SWITCHDEV_EVENT_PAIR,
> MLX5_SWITCHDEV_EVENT_UNPAIR,
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
` (4 preceding siblings ...)
2026-04-09 11:55 ` [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 18:02 ` Mark Bloch
2026-04-09 11:55 ` [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init Tariq Toukan
2026-04-09 18:20 ` [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Mark Bloch
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
mlx5_eswitch_register_vport_reps() merely sets the callbacks. The actual
representor load/unload requires devlink locking and shouldn’t run from
the registration context. Queue a work that acquires the devlink lock,
loads all relevant reps. This lets load happen where the needed locks
can be taken.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../mellanox/mlx5/core/eswitch_offloads.c | 34 +++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4b626ffcfa8e..279490c0074c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4535,6 +4535,38 @@ mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
}
}
+static void mlx5_eswitch_reload_reps_blocked(struct mlx5_eswitch *esw)
+{
+ struct mlx5_vport *vport;
+ unsigned long i;
+
+ if (esw->mode != MLX5_ESWITCH_OFFLOADS)
+ return;
+
+ if (mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK))
+ return;
+
+ mlx5_esw_for_each_vport(esw, i, vport) {
+ if (!vport)
+ continue;
+ if (!vport->enabled)
+ continue;
+ if (vport->vport == MLX5_VPORT_UPLINK)
+ continue;
+ if (!mlx5_eswitch_vport_has_rep(esw, vport->vport))
+ continue;
+
+ mlx5_esw_offloads_rep_load(esw, vport->vport);
+ }
+}
+
+static void mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
+{
+ mlx5_esw_reps_block(esw);
+ mlx5_eswitch_reload_reps_blocked(esw);
+ mlx5_esw_reps_unblock(esw);
+}
+
void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
const struct mlx5_eswitch_rep_ops *ops,
u8 rep_type)
@@ -4542,6 +4574,8 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
mlx5_esw_reps_block(esw);
mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
mlx5_esw_reps_unblock(esw);
+
+ mlx5_esw_add_work(esw, mlx5_eswitch_reload_reps);
}
EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration
2026-04-09 11:55 ` [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration Tariq Toukan
@ 2026-04-09 18:02 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 18:02 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> mlx5_eswitch_register_vport_reps() merely sets the callbacks. The actual
> representor load/unload requires devlink locking and shouldn’t run from
> the registration context. Queue a work that acquires the devlink lock,
> loads all relevant reps. This lets load happen where the needed locks
> can be taken.
>
Sashiko writes:
> (Note: Due to system recitation restrictions, this report does not quote the
> original patch code directly. Instead, it provides a free-form summary of
> the questions and observations regarding the changes.)
> commit 8bc15ac3cf1f185bce88ba5499055d1546dae606
> Author: Mark Bloch <mbloch@nvidia.com>
> net/mlx5: E-switch, load reps via work queue after registration
> This commit shifts the representor load operation to an asynchronous
> workqueue so that the necessary devlink locks can be properly acquired,
> avoiding locking issues in the registration context.
> The commit message notes that actual representor load and unload require
> devlink locking and shouldn't run from the registration context. The patch
> successfully defers the load operation, but it appears the unload path
> in mlx5_eswitch_unregister_vport_reps() was not updated. Will
> mlx5_eswitch_unregister_vport_reps_blocked() and __unload_reps_all_vport()
> still be called synchronously without the devlink lock, leaving the
> teardown path unprotected?
This is not worse than the current behavior. The block/unblock mechanism
ensures correctness even without holding the devlink lock, although this
is not ideal.
Fixing the unload path to follow the same model is planned as a follow-up
series.
> When mlx5_eswitch_register_vport_reps() queues the asynchronous work item,
> there doesn't seem to be a corresponding cancellation or flush in
> mlx5_eswitch_unregister_vport_reps(). If the registering module
> unregisters the representors and unloads before the work executes, could
> the delayed mlx5_eswitch_reload_reps() run and dereference stale callbacks,
> causing a use-after-free?
Each representor maintains state per rep type, and together with the
E-Switch generation counter we avoid executing work on stale instances.
If the E-Switch generation changes, the queued work will not proceed
with loading representors for the old state.
> The call to mlx5_esw_add_work() to enqueue the representor load operation
> might fail and return an error code like -ENOMEM. Since the return value
> isn't checked in mlx5_eswitch_register_vport_reps(), will a failure
> leave the representors in the REP_REGISTERED state but completely unloaded,
> without notifying the caller?
This change addresses the case where the eswitch is already in
switchdev mode and the IB module is loaded afterwards. Prior to this
patch, no representors would be created in that scenario.
After this change, representors are created when possible, improving the
overall user experience. If workqueue allocation fails (e.g. -ENOMEM),
there is little meaningful recovery we can provide in this context.
> In the new mlx5_eswitch_reload_reps_blocked() function, if loading the
> uplink representor fails, the function returns early. Similarly, any
The uplink representor is a prerequisite for all other representors.
If it fails to load, the rest cannot be meaningfully initialized, so it
is treated as a critical failure.
> failures when loading individual VF representors in the loop are not
> checked. Since this now runs asynchronously, are these errors permanently
> lost, potentially leaving the subsystem in a partially initialized state
> with no fallback?
Failures when loading individual VF representors are intentionally not
treated as fatal. The operation is best-effort, and partial
initialization is acceptable in this context.
Mark
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> .../mellanox/mlx5/core/eswitch_offloads.c | 34 +++++++++++++++++++
> 1 file changed, 34 insertions(+)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 4b626ffcfa8e..279490c0074c 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -4535,6 +4535,38 @@ mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
> }
> }
>
> +static void mlx5_eswitch_reload_reps_blocked(struct mlx5_eswitch *esw)
> +{
> + struct mlx5_vport *vport;
> + unsigned long i;
> +
> + if (esw->mode != MLX5_ESWITCH_OFFLOADS)
> + return;
> +
> + if (mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK))
> + return;
> +
> + mlx5_esw_for_each_vport(esw, i, vport) {
> + if (!vport)
> + continue;
> + if (!vport->enabled)
> + continue;
> + if (vport->vport == MLX5_VPORT_UPLINK)
> + continue;
> + if (!mlx5_eswitch_vport_has_rep(esw, vport->vport))
> + continue;
> +
> + mlx5_esw_offloads_rep_load(esw, vport->vport);
> + }
> +}
> +
> +static void mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
> +{
> + mlx5_esw_reps_block(esw);
> + mlx5_eswitch_reload_reps_blocked(esw);
> + mlx5_esw_reps_unblock(esw);
> +}
> +
> void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> const struct mlx5_eswitch_rep_ops *ops,
> u8 rep_type)
> @@ -4542,6 +4574,8 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> mlx5_esw_reps_block(esw);
> mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
> mlx5_esw_reps_unblock(esw);
> +
> + mlx5_esw_add_work(esw, mlx5_eswitch_reload_reps);
> }
> EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
>
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
` (5 preceding siblings ...)
2026-04-09 11:55 ` [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration Tariq Toukan
@ 2026-04-09 11:55 ` Tariq Toukan
2026-04-09 18:02 ` Mark Bloch
2026-04-09 18:20 ` [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Mark Bloch
7 siblings, 1 reply; 15+ messages in thread
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
From: Mark Bloch <mbloch@nvidia.com>
Deployments that always operate in switchdev mode currently require
manual devlink configuration after driver probe, which complicates
automated provisioning.
Introduce MLX5_PROF_MASK_DEF_SWITCHDEV, a new profile mask bit, and
profile index 4. When a device is initialized or reloaded with this
profile, the driver automatically switches the e-switch to switchdev
mode by calling mlx5_devlink_eswitch_mode_set() immediately after
bringing the device online.
A no-op stub of mlx5_devlink_eswitch_mode_set() is added for builds
without CONFIG_MLX5_ESWITCH.
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 6 +++++
.../net/ethernet/mellanox/mlx5/core/main.c | 26 ++++++++++++++++++-
include/linux/mlx5/driver.h | 1 +
3 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 256ac3ad37bc..5dcca59c3125 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -1047,6 +1047,12 @@ mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
return ERR_PTR(-EOPNOTSUPP);
}
+static inline int
+mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_MLX5_ESWITCH */
#endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index dc7f20a357d9..12f39b4b6c2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -86,7 +86,7 @@ MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec t
static unsigned int prof_sel = MLX5_DEFAULT_PROF;
module_param_named(prof_sel, prof_sel, uint, 0444);
-MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 4");
static u32 sw_owner_id[4];
#define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1)
@@ -185,6 +185,11 @@ static struct mlx5_profile profile[] = {
.log_max_qp = LOG_MAX_SUPPORTED_QPS,
.num_cmd_caches = 0,
},
+ [4] = {
+ .mask = MLX5_PROF_MASK_DEF_SWITCHDEV | MLX5_PROF_MASK_QP_SIZE,
+ .log_max_qp = LOG_MAX_SUPPORTED_QPS,
+ .num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
+ },
};
static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili,
@@ -1451,6 +1456,17 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
mlx5_free_bfreg(dev, &dev->priv.bfreg);
}
+static void mlx5_set_default_switchdev(struct mlx5_core_dev *dev)
+{
+ int err;
+
+ err = mlx5_devlink_eswitch_mode_set(priv_to_devlink(dev),
+ DEVLINK_ESWITCH_MODE_SWITCHDEV,
+ NULL);
+ if (err && err != -EOPNOTSUPP)
+ mlx5_core_warn(dev, "failed setting switchdev as default\n");
+}
+
int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
{
bool light_probe = mlx5_dev_is_lightweight(dev);
@@ -1497,6 +1513,10 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
mutex_unlock(&dev->intf_state_mutex);
+
+ if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
+ mlx5_set_default_switchdev(dev);
+
return 0;
err_register:
@@ -1598,6 +1618,10 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
goto err_attach;
mutex_unlock(&dev->intf_state_mutex);
+
+ if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
+ mlx5_set_default_switchdev(dev);
+
return 0;
err_attach:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1268fcf35ec7..cfbc0ff6292a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -706,6 +706,7 @@ struct mlx5_st;
enum {
MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0,
MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1,
+ MLX5_PROF_MASK_DEF_SWITCHDEV = (u64)1 << 2,
};
enum {
--
2.44.0
^ permalink raw reply related [flat|nested] 15+ messages in thread* Re: [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init
2026-04-09 11:55 ` [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init Tariq Toukan
@ 2026-04-09 18:02 ` Mark Bloch
0 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 18:02 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@nvidia.com>
>
> Deployments that always operate in switchdev mode currently require
> manual devlink configuration after driver probe, which complicates
> automated provisioning.
>
> Introduce MLX5_PROF_MASK_DEF_SWITCHDEV, a new profile mask bit, and
> profile index 4. When a device is initialized or reloaded with this
> profile, the driver automatically switches the e-switch to switchdev
> mode by calling mlx5_devlink_eswitch_mode_set() immediately after
> bringing the device online.
>
> A no-op stub of mlx5_devlink_eswitch_mode_set() is added for builds
> without CONFIG_MLX5_ESWITCH.
>
> Signed-off-by: Mark Bloch <mbloch@nvidia.com>
> Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
> Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
> ---
> .../net/ethernet/mellanox/mlx5/core/eswitch.h | 6 +++++
> .../net/ethernet/mellanox/mlx5/core/main.c | 26 ++++++++++++++++++-
> include/linux/mlx5/driver.h | 1 +
> 3 files changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> index 256ac3ad37bc..5dcca59c3125 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
> @@ -1047,6 +1047,12 @@ mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
> return ERR_PTR(-EOPNOTSUPP);
> }
>
> +static inline int
> +mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> + struct netlink_ext_ack *extack)
> +{
> + return -EOPNOTSUPP;
> +}
> #endif /* CONFIG_MLX5_ESWITCH */
>
> #endif /* __MLX5_ESWITCH_H__ */
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> index dc7f20a357d9..12f39b4b6c2a 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
> @@ -86,7 +86,7 @@ MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec t
>
> static unsigned int prof_sel = MLX5_DEFAULT_PROF;
> module_param_named(prof_sel, prof_sel, uint, 0444);
> -MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
> +MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 4");
Sashiko writes:
"Is it appropriate to expand a module parameter to configure a feature that
already has a standard devlink API (devlink dev eswitch set ... mode
switchdev)? Automated provisioning is typically expected to be handled in
userspace rather than configured via driver module parameters."
This is intended as an intermediate step.
The end goal is that for certain environments (e.g. ECPF/DPU), only
switchdev mode is supported and it becomes the default without requiring
user configuration.
There is also the question of multi-NIC systems, where a user may want
only a subset of devices to default to switchdev. This patch does not
aim to solve that. The long-term direction is to tie the behavior to the
NIC type / platform rather than expose it as a generic user-facing knob.
Accordingly, this is not meant to be a general-purpose api. It is intended
for controlled environments (e.g. DPU deployments) where switchdev is
the only supported mode.
>
> static u32 sw_owner_id[4];
> #define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1)
> @@ -185,6 +185,11 @@ static struct mlx5_profile profile[] = {
> .log_max_qp = LOG_MAX_SUPPORTED_QPS,
> .num_cmd_caches = 0,
> },
> + [4] = {
> + .mask = MLX5_PROF_MASK_DEF_SWITCHDEV | MLX5_PROF_MASK_QP_SIZE,
> + .log_max_qp = LOG_MAX_SUPPORTED_QPS,
> + .num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
> + },
> };
>
> static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili,
> @@ -1451,6 +1456,17 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
> mlx5_free_bfreg(dev, &dev->priv.bfreg);
> }
>
> +static void mlx5_set_default_switchdev(struct mlx5_core_dev *dev)
> +{
> + int err;
> +
> + err = mlx5_devlink_eswitch_mode_set(priv_to_devlink(dev),
> + DEVLINK_ESWITCH_MODE_SWITCHDEV,
> + NULL);
Sashiko writes:
"Does calling the internal driver eswitch mode function directly bypass the
devlink core? This appears to prevent the devlink subsystem from emitting
proper netlink state notifications to userspace when the mode transitions.
"
The intent here is to support platforms where switchdev must be the
default (notably DPU/ARM side). Today this transition is handled via
userspace scripts.
This change provides an intermediate step, allowing the device to come
up directly in switchdev mode without relying on userspace orchestration.
It still requires explicit opt-in via the profile.
> + if (err && err != -EOPNOTSUPP)
> + mlx5_core_warn(dev, "failed setting switchdev as default\n");
> +}
> +
> int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
> {
> bool light_probe = mlx5_dev_is_lightweight(dev);
> @@ -1497,6 +1513,10 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
> mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
>
> mutex_unlock(&dev->intf_state_mutex);
> +
> + if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
> + mlx5_set_default_switchdev(dev);
> +
Sashiko writes:
"If a user boots with prof_sel=4 but later manually changes the eswitch mode
to legacy via standard devlink commands, will this call force the device
back into switchdev mode during a firmware recovery or device reload? This
seems to override user-driven runtime devlink configurations."
Given the intended use case, this is acceptable.
These devices are considered “switchdev by default”, so after a firmware
reset or device reload it is expected that they return to switchdev mode.
Preserving a prior user override to legacy mode is not a requirement for
this configuration.
Mark
> return 0;
>
> err_register:
> @@ -1598,6 +1618,10 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
> goto err_attach;
>
> mutex_unlock(&dev->intf_state_mutex);
> +
> + if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
> + mlx5_set_default_switchdev(dev);
> +
> return 0;
>
> err_attach:
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 1268fcf35ec7..cfbc0ff6292a 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -706,6 +706,7 @@ struct mlx5_st;
> enum {
> MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0,
> MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1,
> + MLX5_PROF_MASK_DEF_SWITCHDEV = (u64)1 << 2,
> };
>
> enum {
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock
2026-04-09 11:55 [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock Tariq Toukan
` (6 preceding siblings ...)
2026-04-09 11:55 ` [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init Tariq Toukan
@ 2026-04-09 18:20 ` Mark Bloch
7 siblings, 0 replies; 15+ messages in thread
From: Mark Bloch @ 2026-04-09 18:20 UTC (permalink / raw)
To: Tariq Toukan, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Andrew Lunn, David S. Miller
Cc: Saeed Mahameed, Leon Romanovsky, Shay Drory, Or Har-Toov,
Edward Srouji, Maher Sanalla, Simon Horman, Moshe Shemesh,
Kees Cook, Patrisious Haddad, Gerd Bayer, Parav Pandit,
Cosmin Ratiu, Carolina Jubran, netdev, linux-rdma, linux-kernel,
Gal Pressman, Dragos Tatulea
On 09/04/2026 14:55, Tariq Toukan wrote:
> Hi,
>
> See detailed description by Mark below [1].
>
Sashiko flagged a few issues in the series. The main ones are in patch 4
([PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq).
These are known, and I currently consider them acceptable trade-offs rather than bugs.
That said, reviewers/maintainers may reasonably see this differently.
Before sending a v2 focused on patch 4, I’d appreciate feedback on the overall
approach and direction of the series; please see Sashiko's comments here:
https://sashiko.dev/#/patchset/20260409115550.156419-1-tariqt%40nvidia.com
I've provided my comments on each patch on the mailing list and
included what was found by Sashiko.
The patch series can be broken into 3:
[patches 2/3/4] – workqueue deadlock:
During teardown we must ensure all pending work is completed.
However, since the teardown path already holds the devlink lock,
we cannot have work items blocking on that same lock without
risking a deadlock.
[patches 5/6] – reps block/unblock state
The interaction between mlx5_core and mlx5_ib (load/unload),
eswitch mode transitions, and auxiliary device handling makes
this particularly tricky.
Several conventional locking/synchronization approaches were
explored, but they either reintroduced deadlocks or created
even more complex issues. The current approach is admittedly
not the cleanest, but it has proven to behave correctly in practice.
[patch 7] – switchdev by default
The long-term goal is to have switchdev as the default mode
for DPU environments. Flipping that behavior globally in one
step is risky.
This patch provides a controlled, incremental way to move in
that direction, allowing validation in real deployments before
making it the default.
Mark
> Regards,
> Tariq
>
> [1]
> This series addresses three problems that have been present for years.
> First, there is no coordination between E-Switch reconfiguration and
> representor registration. The E-Switch can be mid-way through a mode
> change or VF count update while mlx5_ib walks in and registers or
> unregisters representors. Nothing stops them. The race window is small
> and there is no field report, but it is clearly wrong.
>
> A mutex is not the answer. The representor callbacks reach into RDMA,
> netdev, and LAG layers that already hold their own locks, making a
> new mutex in the E-Switch layer a deadlock waiting to happen.
>
> Second, the E-Switch work queue has a deadlock of its own.
> mlx5_eswitch_cleanup() drains the work queue while holding the devlink
> lock. Workers on that queue acquire devlink lock before checking whether
> their work is still relevant. They block. The cleanup path waits for
> them to finish. Deadlock.
>
> Third, loading mlx5_ib while the device is already in switchdev mode
> does not bring up the IB representors. This has been broken for years.
> mlx5_eswitch_register_vport_reps() only stores callbacks; nobody
> triggers the actual load after registration.
>
> For the work queue deadlock: introduce a generation counter in the
> top-level mlx5_eswitch struct (moved from mlx5_esw_functions,
> which only covered function-change events) and a generic dispatch helper
> mlx5_esw_add_work(). The worker esw_wq_handler() checks the counter
> before touching the devlink lock using devl_trylock() in a loop. Stale
> work exits immediately without ever contending. The counter is
> incremented at every E-Switch operation boundary: cleanup, disable,
> mode-set, enable, disable_sriov.
>
> For the registration race: a simple atomic block state guards all
> reconfiguration paths. mlx5_esw_reps_block()/mlx5_esw_reps_unblock()
> spin a cmpxchg between UNBLOCKED and BLOCKED. Every reconfiguration
> path (mode set, enable, disable, VF/SF add/del, LAG reload, and the
> register/unregister calls themselves) brackets its work with this guard.
> No new locks, no deadlock risk.
>
> For the missing IB representors: now that the work queue infrastructure
> is in place, mlx5_eswitch_register_vport_reps() queues a work item that
> acquires the devlink lock and loads all relevant representors. This is
> the change that actually fixes the long-standing bug.
>
> One thing worth calling out: the block guard is non-reentrant. A caller
> that tries to transition UNBLOCKED->BLOCKED while the E-Switch is already
> BLOCKED will spin forever. All call sites were audited:
>
> - mlx5_eswitch_enable/disable/disable_sriov hold BLOCKED only around
> low-level vport helpers that do not call register/unregister.
>
> - Inside mlx5_eswitch_unregister_vport_reps the unload callbacks run
> while BLOCKED is held. The one callback that calls unregister
> (mlx5_ib_vport_rep_unload in LAG shared-FDB mode) only does so on
> peer E-Switch instances, each with its own independent atomic.
>
> - mlx5_devlink_eswitch_mode_set acquires BLOCKED, then calls
> esw_offloads_start/stop -> esw_mode_change. esw_mode_change releases
> BLOCKED before calling rescan_drivers so that the probe/remove
> callbacks that trigger register/unregister see UNBLOCKED.
> esw_mode_change re-acquires before returning, and mode_set releases
> at the end. This is an explicit hand-off of the guard across the
> rescan window.
>
> - mlx5_eswitch_register_vport_reps holds BLOCKED only while storing
> callbacks and queuing the load work. The actual rep loading runs from
> the work queue after the guard is released.
>
> Patch 1 is cleanup. LAG and MPESW had the same representor reload
> sequence duplicated in several places and the copies had started to
> drift. This consolidates them into one helper.
>
> Patches 2-4 fix the work queue deadlock in three steps: first move the
> generation counter from mlx5_esw_functions to mlx5_eswitch;
> then introduce the generic esw_wq_handler/mlx5_esw_add_work dispatch
> infrastructure; then apply the actual fix by switching to devl_trylock
> and adding generation increments at all operation boundaries.
>
> Patch 5 adds the atomic block guard for representor registration,
> protecting all reconfiguration paths.
>
> Patch 6 moves the representor load triggered by
> mlx5_eswitch_register_vport_reps() onto the work queue. This is the
> patch that fixes IB representors not coming up when mlx5_ib is loaded
> while the device is already in switchdev mode.
>
> Patch 7 adds a driver profile that auto-enables switchdev at device
> init, for deployments that always operate in switchdev mode and want
> to avoid a manual devlink command after every probe.
>
> Mark Bloch (7):
> net/mlx5: Lag: refactor representor reload handling
> net/mlx5: E-Switch, move work queue generation counter
> net/mlx5: E-Switch, introduce generic work queue dispatch helper
> net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq
> net/mlx5: E-Switch, block representors during reconfiguration
> net/mlx5: E-switch, load reps via work queue after registration
> net/mlx5: Add profile to auto-enable switchdev mode at device init
>
> .../net/ethernet/mellanox/mlx5/core/eswitch.c | 25 ++-
> .../net/ethernet/mellanox/mlx5/core/eswitch.h | 15 +-
> .../mellanox/mlx5/core/eswitch_offloads.c | 204 ++++++++++++++----
> .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 46 ++--
> .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 +
> .../ethernet/mellanox/mlx5/core/lag/mpesw.c | 12 +-
> .../net/ethernet/mellanox/mlx5/core/main.c | 26 ++-
> .../ethernet/mellanox/mlx5/core/sf/devlink.c | 5 +
> include/linux/mlx5/driver.h | 1 +
> include/linux/mlx5/eswitch.h | 5 +
> 10 files changed, 267 insertions(+), 73 deletions(-)
>
>
> base-commit: 9700282a7ec721e285771d995ccfe33845e776dc
^ permalink raw reply [flat|nested] 15+ messages in thread