public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Leon Romanovsky <leon@kernel.org>, Jason Gunthorpe <jgg@ziepe.ca>,
	"Saeed Mahameed" <saeedm@nvidia.com>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Simon Horman <horms@kernel.org>,
	Maher Sanalla <msanalla@nvidia.com>,
	Parav Pandit <parav@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>,
	Kees Cook <kees@kernel.org>, Gerd Bayer <gbayer@linux.ibm.com>,
	Moshe Shemesh <moshe@nvidia.com>,
	Carolina Jubran <cjubran@nvidia.com>,
	Cosmin Ratiu <cratiu@nvidia.com>, <linux-rdma@vger.kernel.org>,
	<linux-kernel@vger.kernel.org>, <netdev@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>,
	Dragos Tatulea <dtatulea@nvidia.com>
Subject: [PATCH net-next V3 4/7] net/mlx5: Lag, avoid LAG and representor lock cycles
Date: Sun, 3 May 2026 23:27:23 +0300	[thread overview]
Message-ID: <20260503202726.266415-5-tariqt@nvidia.com> (raw)
In-Reply-To: <20260503202726.266415-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

The LAG shared-FDB and multiport E-Switch transitions rescan auxiliary
devices and reload IB representors while holding ldev->lock. Driver
bind/unbind paths may register or unregister E-Switch representor ops, and
representor load paths may enter LAG code, so holding ldev->lock across
those calls creates lock-order cycles with the E-Switch representor lock.

Keep the devcom component locked for the transition, but drop ldev->lock
before rescanning auxiliary devices or reloading IB representors. Mark the
LAG transition as in progress while the lock is dropped and assert the
devcom lock where the helper relies on it. This preserves LAG serialization
while avoiding ldev->lock nesting under E-Switch representor registration.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 142 ++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h |   7 +-
 .../ethernet/mellanox/mlx5/core/lag/mpesw.c   |  10 +-
 .../ethernet/mellanox/mlx5/core/lib/devcom.c  |   8 +
 .../ethernet/mellanox/mlx5/core/lib/devcom.h  |   1 +
 5 files changed, 134 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index a474f970e056..e77f9931c39c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1063,37 +1063,99 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 	return true;
 }
 
-void mlx5_lag_add_devices(struct mlx5_lag *ldev)
+static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
 {
+	struct mlx5_devcom_comp_dev *devcom = NULL;
 	struct lag_func *pf;
 	int i;
 
-	mlx5_ldev_for_each(i, 0, ldev) {
-		pf = mlx5_lag_pf(ldev, i);
-		if (pf->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
-			continue;
+	lockdep_assert_held(&ldev->lock);
 
-		pf->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-		mlx5_rescan_drivers_locked(pf->dev);
+	i = mlx5_get_next_ldev_func(ldev, 0);
+	if (i < MLX5_MAX_PORTS) {
+		pf = mlx5_lag_pf(ldev, i);
+		devcom = pf->dev->priv.hca_devcom_comp;
 	}
+	mlx5_devcom_comp_assert_locked(devcom);
 }
 
-void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
+static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev)
+{
+	mlx5_lag_assert_locked_transition(ldev);
+
+	/* Keep PF membership stable while ldev->lock is dropped. Device add
+	 * and remove paths observe mode_changes_in_progress and retry.
+	 */
+	ldev->mode_changes_in_progress++;
+	mutex_unlock(&ldev->lock);
+}
+
+static void mlx5_lag_retake_lock_after_reps(struct mlx5_lag *ldev)
 {
+	mutex_lock(&ldev->lock);
+	ldev->mode_changes_in_progress--;
+}
+
+void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
+				struct mlx5_core_dev *dev,
+				bool enable)
+{
+	if (dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
+		return;
+
+	if (enable)
+		dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+	else
+		dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+
+	/* Auxiliary bus probe/remove can register or unregister representor
+	 * callbacks and take reps_lock. Drop ldev->lock so the only ordering
+	 * remains reps_lock -> ldev->lock from representor callbacks.
+	 */
+	mlx5_lag_drop_lock_for_reps(ldev);
+	mlx5_rescan_drivers_locked(dev);
+	mlx5_lag_retake_lock_after_reps(ldev);
+}
+
+static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable)
+{
+	struct mlx5_core_dev *devs[MLX5_MAX_PORTS];
 	struct lag_func *pf;
+	int num_devs = 0;
 	int i;
 
+	mlx5_lag_assert_locked_transition(ldev);
+
 	mlx5_ldev_for_each(i, 0, ldev) {
 		pf = mlx5_lag_pf(ldev, i);
 		if (pf->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
 			continue;
 
-		pf->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-		mlx5_rescan_drivers_locked(pf->dev);
+		if (enable)
+			pf->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+		else
+			pf->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+		devs[num_devs++] = pf->dev;
 	}
+
+	mlx5_lag_drop_lock_for_reps(ldev);
+	for (i = 0; i < num_devs; i++)
+		mlx5_rescan_drivers_locked(devs[i]);
+	mlx5_lag_retake_lock_after_reps(ldev);
 }
 
-int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags, bool cont_on_fail)
+void mlx5_lag_add_devices(struct mlx5_lag *ldev)
+{
+	mlx5_lag_rescan_devices_locked(ldev, true);
+}
+
+void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
+{
+	mlx5_lag_rescan_devices_locked(ldev, false);
+}
+
+static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags,
+					    bool cont_on_fail)
 {
 	struct lag_func *pf;
 	int ret;
@@ -1105,7 +1167,9 @@ int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags, bool cont_on_fail)
 			struct mlx5_eswitch *esw;
 
 			esw = pf->dev->priv.eswitch;
+			mlx5_esw_reps_block(esw);
 			ret = mlx5_eswitch_reload_ib_reps(esw);
+			mlx5_esw_reps_unblock(esw);
 			if (ret && !cont_on_fail)
 				return ret;
 		}
@@ -1114,6 +1178,34 @@ int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags, bool cont_on_fail)
 	return 0;
 }
 
+static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
+				   bool cont_on_fail)
+{
+	int ret;
+
+	/* The HCA devcom component lock serializes LAG mode transitions while
+	 * ldev->lock is dropped here. Dropping ldev->lock is required because
+	 * the reload takes the per-E-Switch reps_lock, and representor
+	 * load/unload callbacks can re-enter LAG netdev add/remove and take
+	 * ldev->lock. Keep the ordering reps_lock -> ldev->lock.
+	 */
+	mlx5_lag_drop_lock_for_reps(ldev);
+	ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, cont_on_fail);
+	mlx5_lag_retake_lock_after_reps(ldev);
+
+	return ret;
+}
+
+int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
+					bool cont_on_fail)
+{
+	int ret;
+
+	ret = mlx5_lag_reload_ib_reps(ldev, flags, cont_on_fail);
+
+	return ret;
+}
+
 void mlx5_disable_lag(struct mlx5_lag *ldev)
 {
 	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
@@ -1132,10 +1224,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
 	if (shared_fdb) {
 		mlx5_lag_remove_devices(ldev);
 	} else if (roce_lag) {
-		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
-			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-			mlx5_rescan_drivers_locked(dev0);
-		}
+		mlx5_lag_rescan_dev_locked(ldev, dev0, false);
 		mlx5_ldev_for_each(i, 0, ldev) {
 			if (i == idx)
 				continue;
@@ -1151,8 +1240,9 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
 		mlx5_lag_add_devices(ldev);
 
 	if (shared_fdb)
-		mlx5_lag_reload_ib_reps(ldev, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV,
-					true);
+		mlx5_lag_reload_ib_reps_from_locked(ldev,
+						    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV,
+						    true);
 }
 
 bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
@@ -1409,7 +1499,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 			if (shared_fdb || roce_lag)
 				mlx5_lag_add_devices(ldev);
 			if (shared_fdb)
-				mlx5_lag_reload_ib_reps(ldev, 0, true);
+				mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
+								    true);
 
 			return;
 		}
@@ -1417,8 +1508,7 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 		if (roce_lag) {
 			struct mlx5_core_dev *dev;
 
-			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-			mlx5_rescan_drivers_locked(dev0);
+			mlx5_lag_rescan_dev_locked(ldev, dev0, true);
 			mlx5_ldev_for_each(i, 0, ldev) {
 				if (i == idx)
 					continue;
@@ -1427,15 +1517,15 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 					mlx5_nic_vport_enable_roce(dev);
 			}
 		} else if (shared_fdb) {
-			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-			mlx5_rescan_drivers_locked(dev0);
-			err = mlx5_lag_reload_ib_reps(ldev, 0, false);
+			mlx5_lag_rescan_dev_locked(ldev, dev0, true);
+			err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
+								  false);
 			if (err) {
-				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-				mlx5_rescan_drivers_locked(dev0);
+				mlx5_lag_rescan_dev_locked(ldev, dev0, false);
 				mlx5_deactivate_lag(ldev);
 				mlx5_lag_add_devices(ldev);
-				mlx5_lag_reload_ib_reps(ldev, 0, true);
+				mlx5_lag_reload_ib_reps_from_locked(ldev, 0,
+								    true);
 				mlx5_core_err(dev0, "Failed to enable lag\n");
 				return;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index daca8ebd5256..6afe7707d076 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -164,6 +164,9 @@ void mlx5_disable_lag(struct mlx5_lag *ldev);
 void mlx5_lag_remove_devices(struct mlx5_lag *ldev);
 int mlx5_deactivate_lag(struct mlx5_lag *ldev);
 void mlx5_lag_add_devices(struct mlx5_lag *ldev);
+void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
+				struct mlx5_core_dev *dev,
+				bool enable);
 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
 
 #ifdef CONFIG_MLX5_ESWITCH
@@ -199,6 +202,6 @@ int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
 int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
 int mlx5_lag_num_devs(struct mlx5_lag *ldev);
 int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
-int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
-			    bool cont_on_fail);
+int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
+					bool cont_on_fail);
 #endif /* __MLX5_LAG_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index edcd06f3be7a..8a349f8fd823 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -100,9 +100,8 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 		goto err_add_devices;
 	}
 
-	dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-	mlx5_rescan_drivers_locked(dev0);
-	err = mlx5_lag_reload_ib_reps(ldev, 0, false);
+	mlx5_lag_rescan_dev_locked(ldev, dev0, true);
+	err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false);
 	if (err)
 		goto err_rescan_drivers;
 
@@ -111,12 +110,11 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 	return 0;
 
 err_rescan_drivers:
-	dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
-	mlx5_rescan_drivers_locked(dev0);
+	mlx5_lag_rescan_dev_locked(ldev, dev0, false);
 	mlx5_deactivate_lag(ldev);
 err_add_devices:
 	mlx5_lag_add_devices(ldev);
-	mlx5_lag_reload_ib_reps(ldev, 0, true);
+	mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true);
 	mlx5_mpesw_metadata_cleanup(ldev);
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index 4b5ac2db55ce..d40c53193ea8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -3,6 +3,7 @@
 
 #include <linux/mlx5/vport.h>
 #include <linux/list.h>
+#include <linux/lockdep.h>
 #include "lib/devcom.h"
 #include "lib/mlx5.h"
 #include "mlx5_core.h"
@@ -438,3 +439,10 @@ int mlx5_devcom_comp_trylock(struct mlx5_devcom_comp_dev *devcom)
 		return 0;
 	return down_write_trylock(&devcom->comp->sem);
 }
+
+void mlx5_devcom_comp_assert_locked(struct mlx5_devcom_comp_dev *devcom)
+{
+	if (!devcom)
+		return;
+	lockdep_assert_held_write(&devcom->comp->sem);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index 91e5ae529d5c..316052a85ca5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -75,5 +75,6 @@ void *mlx5_devcom_get_next_peer_data_rcu(struct mlx5_devcom_comp_dev *devcom,
 void mlx5_devcom_comp_lock(struct mlx5_devcom_comp_dev *devcom);
 void mlx5_devcom_comp_unlock(struct mlx5_devcom_comp_dev *devcom);
 int mlx5_devcom_comp_trylock(struct mlx5_devcom_comp_dev *devcom);
+void mlx5_devcom_comp_assert_locked(struct mlx5_devcom_comp_dev *devcom);
 
 #endif /* __LIB_MLX5_DEVCOM_H__ */
-- 
2.44.0


  parent reply	other threads:[~2026-05-03 20:28 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-03 20:27 [PATCH net-next V3 0/7] net/mlx5: Improve representor lifecycle and late IB representor loading Tariq Toukan
2026-05-03 20:27 ` [PATCH net-next V3 1/7] net/mlx5: Lag: refactor representor reload handling Tariq Toukan
2026-05-03 20:27 ` [PATCH net-next V3 2/7] net/mlx5: E-Switch, let esw work callers choose GFP flags Tariq Toukan
2026-05-03 20:27 ` [PATCH net-next V3 3/7] net/mlx5: E-Switch, add representor lifecycle lock Tariq Toukan
2026-05-03 20:27 ` Tariq Toukan [this message]
2026-05-03 20:27 ` [PATCH net-next V3 5/7] net/mlx5: E-Switch, serialize representor lifecycle Tariq Toukan
2026-05-03 20:27 ` [PATCH net-next V3 6/7] net/mlx5: E-Switch, unwind only newly loaded representor types Tariq Toukan
2026-05-03 20:27 ` [PATCH net-next V3 7/7] net/mlx5: E-Switch, load reps via work queue after registration Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260503202726.266415-5-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=cjubran@nvidia.com \
    --cc=cratiu@nvidia.com \
    --cc=davem@davemloft.net \
    --cc=dtatulea@nvidia.com \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=gbayer@linux.ibm.com \
    --cc=horms@kernel.org \
    --cc=jgg@ziepe.ca \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox