Netdev List
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Nimrod Oren <noren@nvidia.com>, Yael Chemla <ychemla@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Simon Horman <horms@kernel.org>,
	Maher Sanalla <msanalla@nvidia.com>,
	Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
	Patrisious Haddad <phaddad@nvidia.com>,
	Moshe Shemesh <moshe@nvidia.com>, <linux-kernel@vger.kernel.org>,
	<netdev@vger.kernel.org>, <linux-rdma@vger.kernel.org>,
	Gal Pressman <gal@nvidia.com>,
	Jacob Keller <jacob.e.keller@intel.com>
Subject: [PATCH net-next V2 06/13] net/mlx5: LAG, extend shared FDB API with group_id filter
Date: Sun, 31 May 2026 14:39:46 +0300	[thread overview]
Message-ID: <20260531113954.395443-7-tariqt@nvidia.com> (raw)
In-Reply-To: <20260531113954.395443-1-tariqt@nvidia.com>

From: Shay Drory <shayd@nvidia.com>

Add a group_id parameter to mlx5_lag_shared_fdb_create() and
mlx5_lag_shared_fdb_destroy() to scope shared FDB operations to a
specific SD group.

When group_id is U32_MAX, the functions operate on all LAG devices. When
group_id is non-zero, they operate only on devices in that SD group
without issuing FW LAG commands, since SD LAG is a pure software
construct.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 195 ++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h |  32 ++-
 .../ethernet/mellanox/mlx5/core/lag/mpesw.c   |   7 +-
 .../mellanox/mlx5/core/lag/shared_fdb.c       | 153 +++++++++++---
 .../net/ethernet/mellanox/mlx5/core/lib/sd.c  |  10 +
 .../net/ethernet/mellanox/mlx5/core/lib/sd.h  |  10 +
 6 files changed, 324 insertions(+), 83 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 03cb02c7000d..3decb49e9f19 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -370,6 +370,22 @@ int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq)
 	return -ENOENT;
 }
 
+/* Return the appropriate iterator filter for a device in LAG:
+ * - SD shared FDB active: iterate only the device's SD group
+ * - SD group exists but shared FDB not active: iterate all devices
+ * - No SD: iterate ports only
+ */
+static u32 mlx5_lag_get_filter(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+	struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+	if (pf && pf->sd_fdb_active)
+		return pf->group_id;
+	if (pf && pf->group_id)
+		return MLX5_LAG_FILTER_ALL;
+	return MLX5_LAG_FILTER_PORTS;
+}
+
 /* Reverse of mlx5_lag_get_dev_index_by_seq: given a device, return its
  * sequence number in the LAG. Master is always 0, others numbered
  * sequentially starting from 1.
@@ -379,11 +395,13 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
 	struct mlx5_lag *ldev = mlx5_lag_dev(dev);
 	int master_idx, i, num = 1;
 	struct lag_func *pf;
+	u32 filter;
 
 	if (!ldev)
 		return -ENOENT;
 
-	master_idx = mlx5_lag_get_master_idx(ldev);
+	filter = mlx5_lag_get_filter(ldev, dev);
+	master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, 0, filter);
 	if (master_idx < 0)
 		return -ENOENT;
 
@@ -391,7 +409,7 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
 	if (pf && pf->dev == dev)
 		return 0;
 
-	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
+	mlx5_lag_for_each(i, 0, ldev, filter) {
 		if (i == master_idx)
 			continue;
 		pf = mlx5_lag_pf(ldev, i);
@@ -403,6 +421,69 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_lag_get_dev_seq);
 
+/* seq 0 = master, then all remaining devices */
+static int mlx5_lag_get_dev_index_by_seq_all(struct mlx5_lag *ldev, int seq)
+{
+	int master_idx, i, num = 0;
+
+	master_idx = mlx5_lag_get_master_idx(ldev);
+
+	if (master_idx >= 0) {
+		if (seq == 0)
+			return master_idx;
+		num++;
+	}
+
+	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
+		if (i == master_idx)
+			continue;
+		if (num == seq)
+			return i;
+		num++;
+	}
+	return -ENOENT;
+}
+
+/* From group POV, port-marked entry is the lag master */
+static int mlx5_lag_get_dev_index_by_seq_group(struct mlx5_lag *ldev, int seq,
+					       u32 group_id)
+{
+	int i, num = 0;
+
+	mlx5_lag_for_each(i, 0, ldev, group_id) {
+		if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT)) {
+			if (seq == 0)
+				return i;
+			num++;
+			break;
+		}
+	}
+
+	mlx5_lag_for_each(i, 0, ldev, group_id) {
+		if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT))
+			continue;
+		if (num == seq)
+			return i;
+		num++;
+	}
+	return -ENOENT;
+}
+
+int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq,
+					 u32 filter)
+{
+	if (!ldev)
+		return -ENOENT;
+
+	if (!filter || filter == MLX5_LAG_FILTER_PORTS)
+		return mlx5_lag_get_dev_index_by_seq(ldev, seq);
+
+	if (filter == MLX5_LAG_FILTER_ALL)
+		return mlx5_lag_get_dev_index_by_seq_all(ldev, seq);
+
+	return mlx5_lag_get_dev_index_by_seq_group(ldev, seq, filter);
+}
+
 /* Devcom events for LAG master marking */
 #define LAG_DEVCOM_PAIR		(0)
 #define LAG_DEVCOM_UNPAIR	(1)
@@ -512,6 +593,14 @@ static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
 	return ldev->mode == MLX5_LAG_MODE_SRIOV;
 }
 
+static bool __mlx5_lag_is_sd_active(struct mlx5_lag *ldev,
+				    struct mlx5_core_dev *dev)
+{
+	struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+	return pf && pf->sd_fdb_active;
+}
+
 /* Create a mapping between steering slots and active ports.
  * As we have ldev->buckets slots per port first assume the native
  * mapping should be used.
@@ -927,27 +1016,19 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev)
 	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
 	bool roce_lag = __mlx5_lag_is_roce(ldev);
 	unsigned long flags = ldev->mode_flags;
-	struct mlx5_eswitch *master_esw;
 	struct mlx5_core_dev *dev0;
 	int err;
-	int i;
 
 	if (master_idx < 0)
 		return -EINVAL;
 
 	dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
-	master_esw = dev0->priv.eswitch;
 	ldev->mode = MLX5_LAG_MODE_NONE;
 	ldev->mode_flags = 0;
 	mlx5_lag_mp_reset(ldev);
 
 	if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
-		mlx5_ldev_for_each(i, 0, ldev) {
-			if (i == master_idx)
-				continue;
-			mlx5_eswitch_offloads_single_fdb_del_one(master_esw,
-								 mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
-		}
+		mlx5_lag_destroy_single_fdb(ldev);
 		clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
 	}
 
@@ -1026,7 +1107,7 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 	return true;
 }
 
-static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
+static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev, u32 filter)
 {
 	struct mlx5_devcom_comp_dev *devcom = NULL;
 	struct lag_func *pf;
@@ -1034,17 +1115,21 @@ static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
 
 	lockdep_assert_held(&ldev->lock);
 
-	i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS);
+	i = mlx5_get_next_lag_func(ldev, 0, filter);
 	if (i < MLX5_MAX_PORTS) {
 		pf = mlx5_lag_pf(ldev, i);
-		devcom = pf->dev->priv.hca_devcom_comp;
+		if (filter == MLX5_LAG_FILTER_PORTS ||
+		    filter == MLX5_LAG_FILTER_ALL)
+			devcom = pf->dev->priv.hca_devcom_comp;
+		else
+			devcom = mlx5_sd_get_devcom(pf->dev);
 	}
 	mlx5_devcom_comp_assert_locked(devcom);
 }
 
-static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev)
+static void mlx5_lag_drop_lock_for_reps(struct mlx5_lag *ldev, u32 filter)
 {
-	mlx5_lag_assert_locked_transition(ldev);
+	mlx5_lag_assert_locked_transition(ldev, filter);
 
 	/* Keep PF membership stable while ldev->lock is dropped. Device add
 	 * and remove paths observe mode_changes_in_progress and retry.
@@ -1075,21 +1160,22 @@ void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
 	 * callbacks and take reps_lock. Drop ldev->lock so the only ordering
 	 * remains reps_lock -> ldev->lock from representor callbacks.
 	 */
-	mlx5_lag_drop_lock_for_reps(ldev);
+	mlx5_lag_drop_lock_for_reps(ldev, mlx5_lag_get_filter(ldev, dev));
 	mlx5_rescan_drivers_locked(dev);
 	mlx5_lag_retake_lock_after_reps(ldev);
 }
 
-static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable)
+static void mlx5_lag_rescan_devices_locked_filter(struct mlx5_lag *ldev,
+						  bool enable, u32 filter)
 {
 	struct mlx5_core_dev *devs[MLX5_MAX_PORTS];
 	struct lag_func *pf;
 	int num_devs = 0;
 	int i;
 
-	mlx5_lag_assert_locked_transition(ldev);
+	mlx5_lag_assert_locked_transition(ldev, filter);
 
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, filter) {
 		pf = mlx5_lag_pf(ldev, i);
 		if (pf->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
 			continue;
@@ -1101,30 +1187,40 @@ static void mlx5_lag_rescan_devices_locked(struct mlx5_lag *ldev, bool enable)
 		devs[num_devs++] = pf->dev;
 	}
 
-	mlx5_lag_drop_lock_for_reps(ldev);
+	mlx5_lag_drop_lock_for_reps(ldev, filter);
 	for (i = 0; i < num_devs; i++)
 		mlx5_rescan_drivers_locked(devs[i]);
 	mlx5_lag_retake_lock_after_reps(ldev);
 }
 
+void mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter)
+{
+	mlx5_lag_rescan_devices_locked_filter(ldev, true, filter);
+}
+
 void mlx5_lag_add_devices(struct mlx5_lag *ldev)
 {
-	mlx5_lag_rescan_devices_locked(ldev, true);
+	mlx5_lag_add_devices_filter(ldev, MLX5_LAG_FILTER_PORTS);
+}
+
+void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter)
+{
+	mlx5_lag_rescan_devices_locked_filter(ldev, false, filter);
 }
 
 void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
 {
-	mlx5_lag_rescan_devices_locked(ldev, false);
+	mlx5_lag_remove_devices_filter(ldev, MLX5_LAG_FILTER_PORTS);
 }
 
 static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags,
-					    bool cont_on_fail)
+					    u32 filter, bool cont_on_fail)
 {
 	struct lag_func *pf;
 	int ret;
 	int i;
 
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, filter) {
 		pf = mlx5_lag_pf(ldev, i);
 		if (!(pf->dev->priv.flags & flags)) {
 			struct mlx5_eswitch *esw;
@@ -1142,7 +1238,7 @@ static int mlx5_lag_reload_ib_reps_unlocked(struct mlx5_lag *ldev, u32 flags,
 }
 
 static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
-				   bool cont_on_fail)
+				   u32 filter, bool cont_on_fail)
 {
 	int ret;
 
@@ -1152,21 +1248,18 @@ static int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags,
 	 * load/unload callbacks can re-enter LAG netdev add/remove and take
 	 * ldev->lock. Keep the ordering reps_lock -> ldev->lock.
 	 */
-	mlx5_lag_drop_lock_for_reps(ldev);
-	ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, cont_on_fail);
+	mlx5_lag_drop_lock_for_reps(ldev, filter);
+	ret = mlx5_lag_reload_ib_reps_unlocked(ldev, flags, filter,
+					       cont_on_fail);
 	mlx5_lag_retake_lock_after_reps(ldev);
 
 	return ret;
 }
 
 int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
-					bool cont_on_fail)
+					u32 filter, bool cont_on_fail)
 {
-	int ret;
-
-	ret = mlx5_lag_reload_ib_reps(ldev, flags, cont_on_fail);
-
-	return ret;
+	return mlx5_lag_reload_ib_reps(ldev, flags, filter, cont_on_fail);
 }
 
 void mlx5_disable_lag(struct mlx5_lag *ldev)
@@ -1182,7 +1275,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
 		return;
 
 	if (shared_fdb) {
-		mlx5_lag_shared_fdb_destroy(ldev);
+		mlx5_lag_shared_fdb_destroy(ldev, 0);
 		return;
 	}
 
@@ -1420,7 +1513,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 
 		if (shared_fdb) {
 			err = mlx5_lag_shared_fdb_create(ldev, &tracker,
-							 MLX5_LAG_MODE_SRIOV);
+							 MLX5_LAG_MODE_SRIOV,
+							 0);
 			if (err)
 				return;
 		} else {
@@ -2261,7 +2355,8 @@ bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&lag_lock, flags);
 	ldev = mlx5_lag_dev(dev);
-	res  = ldev && __mlx5_lag_is_active(ldev);
+	res  = ldev && (__mlx5_lag_is_active(ldev) ||
+			__mlx5_lag_is_sd_active(ldev, dev));
 	spin_unlock_irqrestore(&lag_lock, flags);
 
 	return res;
@@ -2294,10 +2389,17 @@ bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
 
 	spin_lock_irqsave(&lag_lock, flags);
 	ldev = mlx5_lag_dev(dev);
-	idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
-	if (ldev && __mlx5_lag_is_active(ldev) && idx >= 0) {
-		pf = mlx5_lag_pf(ldev, idx);
-		res = pf && dev == pf->dev;
+	if (ldev) {
+		u32 filter;
+
+		filter = mlx5_lag_get_filter(ldev, dev);
+		idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+							   filter);
+		if ((__mlx5_lag_is_active(ldev) ||
+		     __mlx5_lag_is_sd_active(ldev, dev)) && idx >= 0) {
+			pf = mlx5_lag_pf(ldev, idx);
+			res = pf && dev == pf->dev;
+		}
 	}
 	spin_unlock_irqrestore(&lag_lock, flags);
 
@@ -2324,11 +2426,16 @@ bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev;
 	unsigned long flags;
-	bool res;
+	bool res = false;
 
 	spin_lock_irqsave(&lag_lock, flags);
 	ldev = mlx5_lag_dev(dev);
-	res = ldev && test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
+	if (ldev) {
+		res = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB,
+			       &ldev->mode_flags);
+		if (__mlx5_lag_is_sd(ldev, dev) && !__mlx5_lag_is_active(ldev))
+			res = __mlx5_lag_is_sd_active(ldev, dev);
+	}
 	spin_unlock_irqrestore(&lag_lock, flags);
 
 	return res;
@@ -2429,7 +2536,7 @@ struct mlx5_core_dev *mlx5_lag_get_next_peer_mdev(struct mlx5_core_dev *dev, int
 
 	if (*i == MLX5_MAX_PORTS)
 		goto unlock;
-	mlx5_ldev_for_each(idx, *i, ldev) {
+	mlx5_lag_for_each(idx, *i, ldev, mlx5_lag_get_filter(ldev, dev)) {
 		pf = mlx5_lag_pf(ldev, idx);
 		if (pf->dev != dev)
 			break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 70baa7997364..cbe201529661 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -148,6 +148,14 @@ mlx5_lag_pf_by_dev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
 	return NULL;
 }
 
+static inline bool
+__mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+	struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+	return pf && pf->group_id != 0;
+}
+
 static inline bool
 __mlx5_lag_is_active(struct mlx5_lag *ldev)
 {
@@ -163,25 +171,31 @@ mlx5_lag_is_ready(struct mlx5_lag *ldev)
 #ifdef CONFIG_MLX5_ESWITCH
 int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
 			       struct lag_tracker *tracker,
-			       enum mlx5_lag_mode mode);
-void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev);
+			       enum mlx5_lag_mode mode,
+			       u32 group_id);
+void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id);
 int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev);
+void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev);
 bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev);
+bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter);
 #else
 static inline int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
 					     struct lag_tracker *tracker,
-					     enum mlx5_lag_mode mode)
+					     enum mlx5_lag_mode mode,
+					     u32 group_id)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev) {}
+static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev,
+					       u32 group_id) {}
 
 static inline int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
 {
 	return -EOPNOTSUPP;
 }
 
+static inline void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev) {}
 static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
 {
 	return false;
@@ -211,11 +225,13 @@ void mlx5_ldev_add_debugfs(struct mlx5_core_dev *dev);
 void mlx5_ldev_remove_debugfs(struct dentry *dbg);
 void mlx5_disable_lag(struct mlx5_lag *ldev);
 void mlx5_lag_remove_devices(struct mlx5_lag *ldev);
+void mlx5_lag_remove_devices_filter(struct mlx5_lag *ldev, u32 filter);
 int mlx5_deactivate_lag(struct mlx5_lag *ldev);
 void mlx5_lag_add_devices(struct mlx5_lag *ldev);
 void mlx5_lag_rescan_dev_locked(struct mlx5_lag *ldev,
 				struct mlx5_core_dev *dev,
 				bool enable);
+void mlx5_lag_add_devices_filter(struct mlx5_lag *ldev, u32 filter);
 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
 
 #ifdef CONFIG_MLX5_ESWITCH
@@ -238,8 +254,8 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
 }
 
 /* Iterator filter constants for mlx5_lag_for_each() */
-#define MLX5_LAG_FILTER_ALL   0        /* iterate ALL devices */
-#define MLX5_LAG_FILTER_PORTS U32_MAX  /* iterate ports only (XA_MARK_PORT) */
+#define MLX5_LAG_FILTER_PORTS 0        /* iterate ports only (XA_MARK_PORT) */
+#define MLX5_LAG_FILTER_ALL   U32_MAX  /* iterate ALL devices */
 /* any other value = iterate devices with that specific group_id */
 
 #define mlx5_lag_for_each(i, start_index, ldev, filter) \
@@ -264,10 +280,12 @@ int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx,
 			  u32 filter);
 int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter);
 int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
+int mlx5_lag_get_dev_index_by_seq_filter(struct mlx5_lag *ldev, int seq,
+					 u32 filter);
 int mlx5_lag_num_devs(struct mlx5_lag *ldev);
 int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
 int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
-					bool cont_on_fail);
+					u32 filter, bool cont_on_fail);
 int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev,
 		       u32 group_id);
 void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 64e2d1dd5308..2cb44084e239 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -85,14 +85,15 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 	    !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table) ||
 	    !MLX5_CAP_GEN(dev0, create_lag_when_not_master_up) ||
 	    !mlx5_lag_check_prereq(ldev) ||
-	    !mlx5_lag_shared_fdb_supported(ldev))
+	    !mlx5_lag_shared_fdb_supported_filter(ldev, MLX5_LAG_FILTER_ALL))
 		return -EOPNOTSUPP;
 
 	err = mlx5_mpesw_metadata_set(ldev);
 	if (err)
 		return err;
 
-	err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW);
+	err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW,
+					 MLX5_LAG_FILTER_ALL);
 	if (err) {
 		mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err);
 		mlx5_mpesw_metadata_cleanup(ldev);
@@ -106,7 +107,7 @@ void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev)
 {
 	if (ldev->mode == MLX5_LAG_MODE_MPESW) {
 		mlx5_mpesw_metadata_cleanup(ldev);
-		mlx5_lag_shared_fdb_destroy(ldev);
+		mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
 	}
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
index b5cbe3409720..1371e14c4c13 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
@@ -8,19 +8,19 @@
 #include "lag.h"
 #include "eswitch.h"
 
-bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
+bool mlx5_lag_shared_fdb_supported_filter(struct mlx5_lag *ldev, u32 filter)
 {
+	int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+						       filter);
 	struct mlx5_core_dev *dev0, *dev;
 	bool ret = false;
-	int idx;
 	int i;
 
-	idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
 	if (idx < 0)
 		return false;
 
 	dev0 = mlx5_lag_pf(ldev, idx)->dev;
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, filter) {
 		if (i == idx)
 			continue;
 		dev = mlx5_lag_pf(ldev, i)->dev;
@@ -42,9 +42,16 @@ bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
 	return ret;
 }
 
-int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
+bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
 {
-	int master_idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
+	return mlx5_lag_shared_fdb_supported_filter(ldev,
+						    MLX5_LAG_FILTER_PORTS);
+}
+
+static int mlx5_lag_create_single_fdb_filter(struct mlx5_lag *ldev, u32 filter)
+{
+	int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+							     filter);
 	struct mlx5_eswitch *master_esw;
 	struct mlx5_core_dev *dev0;
 	int i, j;
@@ -55,7 +62,7 @@ int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
 
 	dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
 	master_esw = dev0->priv.eswitch;
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, filter) {
 		struct mlx5_eswitch *slave_esw;
 
 		if (i == master_idx)
@@ -71,7 +78,7 @@ int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
 	}
 	return 0;
 err:
-	mlx5_ldev_for_each_reverse(j, i, 0, ldev) {
+	mlx5_lag_for_each_reverse(j, i, 0, ldev, filter) {
 		struct mlx5_eswitch *slave_esw;
 
 		if (j == master_idx)
@@ -82,59 +89,147 @@ int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
 	return err;
 }
 
+static void mlx5_lag_destroy_single_fdb_filter(struct mlx5_lag *ldev,
+					       u32 filter)
+{
+	int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+							     filter);
+	struct mlx5_eswitch *master_esw;
+	struct mlx5_eswitch *peer_esw;
+	int i;
+
+	if (master_idx < 0)
+		return;
+
+	master_esw = mlx5_lag_pf(ldev, master_idx)->dev->priv.eswitch;
+	mlx5_lag_for_each(i, 0, ldev, filter) {
+		if (i == master_idx)
+			continue;
+
+		peer_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch;
+		mlx5_eswitch_offloads_single_fdb_del_one(master_esw, peer_esw);
+	}
+}
+
+int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
+{
+	return mlx5_lag_create_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL);
+}
+
+void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev)
+{
+	mlx5_lag_destroy_single_fdb_filter(ldev, MLX5_LAG_FILTER_ALL);
+}
+
+/**
+ * mlx5_lag_shared_fdb_create - Create shared FDB LAG
+ * @ldev: LAG device
+ * @tracker: LAG tracker (NULL for SD)
+ * @mode: LAG mode (unused for SD)
+ * @group_id: SD group ID; 0 (MLX5_LAG_FILTER_PORTS) for ports LAG;
+ *            MLX5_LAG_FILTER_ALL for all-device (mpesw) LAG
+ *
+ * When group_id is 0 (MLX5_LAG_FILTER_PORTS) or MLX5_LAG_FILTER_ALL,
+ * activates a FW LAG with shared FDB.
+ * When group_id is a specific SD group ID, creates a software-only shared
+ * FDB scoped to that group (no FW LAG commands).
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
 int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
 			       struct lag_tracker *tracker,
-			       enum mlx5_lag_mode mode)
+			       enum mlx5_lag_mode mode,
+			       u32 group_id)
 {
-	int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
+	u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+	int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+						       filter);
 	struct mlx5_core_dev *dev0;
+	struct lag_func *pf;
 	int err;
+	int i;
 
 	if (idx < 0)
 		return -EINVAL;
 
 	dev0 = mlx5_lag_pf(ldev, idx)->dev;
 
-	mlx5_lag_remove_devices(ldev);
-
-	err = mlx5_activate_lag(ldev, tracker, mode, true);
-	if (err) {
-		mlx5_core_warn(dev0, "Failed to create LAG in shared FDB mode (%d)\n",
-			       err);
-		goto err_add_devices;
+	mlx5_lag_remove_devices_filter(ldev, filter);
+
+	if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+		err = mlx5_activate_lag(ldev, tracker, mode, true);
+		if (err) {
+			mlx5_core_warn(dev0,
+				       "Failed to create LAG in shared FDB mode (%d)\n",
+				       err);
+			goto err_add_devices;
+		}
+	} else {
+		err = mlx5_lag_create_single_fdb_filter(ldev, group_id);
+		if (err) {
+			mlx5_core_warn(dev0,
+				       "Failed to create SD shared FDB (%d)\n",
+				       err);
+			goto err_add_devices;
+		}
+		mlx5_lag_for_each(i, 0, ldev, filter) {
+			pf = mlx5_lag_pf(ldev, i);
+			pf->sd_fdb_active = true;
+		}
+		BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh);
 	}
 
 	mlx5_lag_rescan_dev_locked(ldev, dev0, true);
-	err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, false);
+	err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, false);
 	if (err) {
 		mlx5_core_err(dev0, "Failed to enable lag\n");
 		goto err_rescan_drivers;
 	}
 
-	mlx5_lag_set_vports_agg_speed(ldev);
+	if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL)
+		mlx5_lag_set_vports_agg_speed(ldev);
 	return 0;
 
 err_rescan_drivers:
 	mlx5_lag_rescan_dev_locked(ldev, dev0, false);
-	mlx5_deactivate_lag(ldev);
+	if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+		mlx5_deactivate_lag(ldev);
+	} else {
+		mlx5_lag_for_each(i, 0, ldev, filter) {
+			pf = mlx5_lag_pf(ldev, i);
+			pf->sd_fdb_active = false;
+		}
+		mlx5_lag_destroy_single_fdb_filter(ldev, group_id);
+	}
 err_add_devices:
-	mlx5_lag_add_devices(ldev);
-	mlx5_lag_reload_ib_reps_from_locked(ldev, 0, true);
+	mlx5_lag_add_devices_filter(ldev, filter);
+	mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, true);
 	return err;
 }
 
-void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev)
+void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id)
 {
+	u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+	struct lag_func *pf;
 	int err;
+	int i;
 
-	mlx5_lag_remove_devices(ldev);
+	mlx5_lag_remove_devices_filter(ldev, filter);
 
-	err = mlx5_deactivate_lag(ldev);
-	if (err)
-		return;
+	if (filter == MLX5_LAG_FILTER_PORTS || filter == MLX5_LAG_FILTER_ALL) {
+		err = mlx5_deactivate_lag(ldev);
+		if (err)
+			return;
+	} else {
+		mlx5_lag_for_each(i, 0, ldev, filter) {
+			pf = mlx5_lag_pf(ldev, i);
+			pf->sd_fdb_active = false;
+		}
+		mlx5_lag_destroy_single_fdb_filter(ldev, group_id);
+	}
 
-	mlx5_lag_add_devices(ldev);
+	mlx5_lag_add_devices_filter(ldev, filter);
 	mlx5_lag_reload_ib_reps_from_locked(ldev,
 					    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV,
-					    true);
+					    filter, true);
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index 6e199161b008..bbd77ae11e84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -57,6 +57,16 @@ static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
 	return sd->primary ? dev : sd->primary_dev;
 }
 
+struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sd *sd = mlx5_get_sd(dev);
+
+	if (!sd)
+		return NULL;
+
+	return sd->devcom;
+}
+
 struct mlx5_core_dev *
 mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index 9bfd5b9756b5..2ab259095d7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -21,6 +21,16 @@ void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
 int mlx5_sd_init(struct mlx5_core_dev *dev);
 void mlx5_sd_cleanup(struct mlx5_core_dev *dev);
 
+#ifdef CONFIG_MLX5_CORE_EN
+struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev);
+#else
+static inline struct mlx5_devcom_comp_dev *
+mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
+{
+	return NULL;
+}
+#endif
+
 #define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos)	\
 	for (i = ix_from;							\
 	     (pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++)
-- 
2.44.0


  parent reply	other threads:[~2026-05-31 11:41 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-31 11:39 [PATCH net-next V2 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 01/13] net/mlx5: LAG, factor out shared FDB code into dedicated file Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 02/13] net/mlx5: E-Switch, align disable sequence with switchdev-to-legacy transition Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 03/13] net/mlx5: E-Switch, move devcom init from TC to eswitch layer Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 04/13] net/mlx5: LAG, replace peer count check with direct peer lookup Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 05/13] net/mlx5: LAG, prepare for SD device integration Tariq Toukan
2026-05-31 11:39 ` Tariq Toukan [this message]
2026-05-31 11:39 ` [PATCH net-next V2 07/13] net/mlx5: SD, introduce Socket Direct LAG Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 08/13] net/mlx5: LAG, block RoCE and VF LAG for SD devices Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 09/13] net/mlx5: LAG, block multipath " Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 10/13] net/mlx5: SD, keep netdev resources on same PF in switchdev mode Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 11/13] net/mlx5e: TC, track peer flow slots with bitmap Tariq Toukan
2026-05-31 11:39 ` [PATCH net-next V2 12/13] net/mlx5e: TC, enable steering for SD LAG Tariq Toukan
2026-06-03  2:26   ` Jakub Kicinski
2026-05-31 11:39 ` [PATCH net-next V2 13/13] net/mlx5e: Verify unique vhca_id count instead of range Tariq Toukan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260531113954.395443-7-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=jacob.e.keller@intel.com \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=noren@nvidia.com \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    --cc=ychemla@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox