Netdev List
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Nimrod Oren <noren@nvidia.com>, Yael Chemla <ychemla@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Maher Sanalla <msanalla@nvidia.com>,
	Simon Horman <horms@kernel.org>, Parav Pandit <parav@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>,
	Kees Cook <kees@kernel.org>, Moshe Shemesh <moshe@nvidia.com>,
	<linux-kernel@vger.kernel.org>, <netdev@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 05/13] net/mlx5: LAG, prepare for SD device integration
Date: Wed, 27 May 2026 15:54:19 +0300	[thread overview]
Message-ID: <20260527125427.385976-6-tariqt@nvidia.com> (raw)
In-Reply-To: <20260527125427.385976-1-tariqt@nvidia.com>

From: Shay Drory <shayd@nvidia.com>

Socket Direct (SD) secondary devices will participate in LAG, even
though they are silent. SD secondary devices share the same physical
port as their primary but are separate PCI functions that need to be
tracked alongside regular LAG ports.

Extend lag_func with a group_id field to identify SD group membership
and introduce a unified iterator that can filter by group. Add APIs
for registering SD secondary devices in an existing LAG.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 59 ++++++++++++++-----
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h | 53 +++++++++++++++--
 2 files changed, 90 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 5dfdd799828f..03cb02c7000d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -242,7 +242,7 @@ static void mlx5_ldev_free(struct kref *ref)
 		unregister_netdevice_notifier_net(net, &ldev->nb);
 	}
 
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
 		pf = mlx5_lag_pf(ldev, i);
 		if (pf->port_change_nb.nb.notifier_call) {
 			struct mlx5_nb *nb = &pf->port_change_nb;
@@ -391,7 +391,7 @@ int mlx5_lag_get_dev_seq(struct mlx5_core_dev *dev)
 	if (pf && pf->dev == dev)
 		return 0;
 
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
 		if (i == master_idx)
 			continue;
 		pf = mlx5_lag_pf(ldev, i);
@@ -1034,7 +1034,7 @@ static void mlx5_lag_assert_locked_transition(struct mlx5_lag *ldev)
 
 	lockdep_assert_held(&ldev->lock);
 
-	i = mlx5_get_next_ldev_func(ldev, 0);
+	i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS);
 	if (i < MLX5_MAX_PORTS) {
 		pf = mlx5_lag_pf(ldev, i);
 		devcom = pf->dev->priv.hca_devcom_comp;
@@ -1482,7 +1482,7 @@ struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev)
 	int i;
 
 	mutex_lock(&ldev->lock);
-	i = mlx5_get_next_ldev_func(ldev, 0);
+	i = mlx5_get_next_lag_func(ldev, 0, MLX5_LAG_FILTER_PORTS);
 	if (i < MLX5_MAX_PORTS) {
 		pf = mlx5_lag_pf(ldev, i);
 		devcom = pf->dev->priv.hca_devcom_comp;
@@ -1965,8 +1965,9 @@ static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
 	spin_unlock_irqrestore(&lag_lock, flags);
 }
 
-static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
-			      struct mlx5_core_dev *dev)
+int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
+		       struct mlx5_core_dev *dev,
+		       u32 group_id)
 {
 	struct lag_func *pf;
 	u32 idx;
@@ -1985,8 +1986,14 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
 
 	pf->idx = idx;
 	pf->dev = dev;
+	pf->group_id = group_id;
 	dev->priv.lag = ldev;
 
+	if (group_id)
+		return 0;
+
+	xa_set_mark(&ldev->pfs, idx, MLX5_LAG_XA_MARK_PORT);
+
 	MLX5_NB_INIT(&pf->port_change_nb,
 		     mlx5_lag_mpesw_port_change_event, PORT_CHANGE);
 	mlx5_eq_notifier_register(dev, &pf->port_change_nb);
@@ -1994,13 +2001,13 @@ static int mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
 	return 0;
 }
 
-static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
-				  struct mlx5_core_dev *dev)
+void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
+			   struct mlx5_core_dev *dev)
 {
 	struct lag_func *pf;
 	int i;
 
-	mlx5_ldev_for_each(i, 0, ldev) {
+	mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
 		pf = mlx5_lag_pf(ldev, i);
 		if (pf->dev == dev)
 			break;
@@ -2035,7 +2042,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 			mlx5_core_err(dev, "Failed to alloc lag dev\n");
 			return 0;
 		}
-		err = mlx5_ldev_add_mdev(ldev, dev);
+		err = mlx5_ldev_add_mdev(ldev, dev, 0);
 		if (err) {
 			mlx5_core_err(dev, "Failed to add mdev to lag dev\n");
 			mlx5_ldev_put(ldev);
@@ -2050,7 +2057,7 @@ static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 		return -EAGAIN;
 	}
 	mlx5_ldev_get(ldev);
-	err = mlx5_ldev_add_mdev(ldev, dev);
+	err = mlx5_ldev_add_mdev(ldev, dev, 0);
 	if (err) {
 		mlx5_ldev_put(ldev);
 		mutex_unlock(&ldev->lock);
@@ -2187,27 +2194,47 @@ void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
 	mlx5_queue_bond_work(ldev, 0);
 }
 
-int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx)
+int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx,
+			  u32 filter)
 {
 	struct lag_func *pf;
 	int i;
 
 	for (i = start_idx; i >= end_idx; i--) {
 		pf = xa_load(&ldev->pfs, i);
-		if (pf && pf->dev)
+		if (!pf || !pf->dev)
+			continue;
+		if (filter == MLX5_LAG_FILTER_PORTS) {
+			if (xa_get_mark(&ldev->pfs, i, MLX5_LAG_XA_MARK_PORT))
+				return i;
+		} else if (filter == MLX5_LAG_FILTER_ALL ||
+			   filter == pf->group_id) {
 			return i;
+		}
 	}
 	return -1;
 }
 
-int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx)
+int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter)
 {
 	struct lag_func *pf;
 	unsigned long idx;
 
-	xa_for_each_start(&ldev->pfs, idx, pf, start_idx)
-		if (pf->dev)
+	if (filter == MLX5_LAG_FILTER_PORTS) {
+		xa_for_each_marked_start(&ldev->pfs, idx, pf,
+					 MLX5_LAG_XA_MARK_PORT, start_idx)
+			if (pf->dev)
+				return idx;
+		return MLX5_MAX_PORTS;
+	}
+
+	xa_for_each_start(&ldev->pfs, idx, pf, start_idx) {
+		if (!pf->dev)
+			continue;
+		if (filter == MLX5_LAG_FILTER_ALL ||
+		    filter == pf->group_id)
 			return idx;
+	}
 	return MLX5_MAX_PORTS;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 23c0457ce799..70baa7997364 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -15,6 +15,13 @@
  * Note: XA_MARK_0 is reserved by XA_FLAGS_ALLOC for free-slot tracking.
  */
 #define MLX5_LAG_XA_MARK_MASTER XA_MARK_1
+/* XArray mark for port-level entries (excludes SD secondaries) */
+#define MLX5_LAG_XA_MARK_PORT   XA_MARK_2
+
+/* Like xa_for_each_marked but starting from a given index */
+#define xa_for_each_marked_start(xa, index, entry, filter, start)	\
+	for (index = start, entry = xa_find(xa, &index, ULONG_MAX, filter); \
+	     entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))
 
 #include "mlx5_core.h"
 #include "mp.h"
@@ -50,6 +57,8 @@ struct lag_func {
 	bool has_drop;
 	unsigned int idx; /* xarray index assigned by LAG */
 	struct mlx5_nb port_change_nb;
+	u32 group_id;        /* SD group ID, 0 = not SD */
+	bool sd_fdb_active;  /* set on all SD group members */
 };
 
 /* Used for collection of netdev event info. */
@@ -125,6 +134,20 @@ mlx5_lag_pf_by_dev_idx(struct mlx5_lag *ldev, int dev_idx)
 	return NULL;
 }
 
+/* Find lag_func by mlx5_core_dev pointer */
+static inline struct lag_func *
+mlx5_lag_pf_by_dev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+	struct lag_func *pf;
+	unsigned long idx;
+
+	xa_for_each(&ldev->pfs, idx, pf) {
+		if (pf->dev == dev)
+			return pf;
+	}
+	return NULL;
+}
+
 static inline bool
 __mlx5_lag_is_active(struct mlx5_lag *ldev)
 {
@@ -214,20 +237,38 @@ static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
 	return true;
 }
 
-#define mlx5_ldev_for_each(i, start_index, ldev) \
-	for (int tmp = start_index; tmp = mlx5_get_next_ldev_func(ldev, tmp), \
+/* Iterator filter constants for mlx5_lag_for_each() */
+#define MLX5_LAG_FILTER_ALL   0        /* iterate ALL devices */
+#define MLX5_LAG_FILTER_PORTS U32_MAX  /* iterate ports only (XA_MARK_PORT) */
+/* any other value = iterate devices with that specific group_id */
+
+#define mlx5_lag_for_each(i, start_index, ldev, filter) \
+	for (int tmp = start_index; \
+	     tmp = mlx5_get_next_lag_func(ldev, tmp, filter), \
 	     i = tmp, tmp < MLX5_MAX_PORTS; tmp++)
 
-#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev)      \
+#define mlx5_lag_for_each_reverse(i, start_index, end_index, ldev, filter) \
 	for (int tmp = start_index, tmp1 = end_index; \
-	     tmp = mlx5_get_pre_ldev_func(ldev, tmp, tmp1), \
+	     tmp = mlx5_get_pre_lag_func(ldev, tmp, tmp1, filter), \
 	     i = tmp, tmp >= tmp1; tmp--)
 
-int mlx5_get_pre_ldev_func(struct mlx5_lag *ldev, int start_idx, int end_idx);
-int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
+/* Convenience wrappers - keep existing behavior */
+#define mlx5_ldev_for_each(i, start_index, ldev) \
+	mlx5_lag_for_each(i, start_index, ldev, MLX5_LAG_FILTER_PORTS)
+
+#define mlx5_ldev_for_each_reverse(i, start_index, end_index, ldev) \
+	mlx5_lag_for_each_reverse(i, start_index, end_index, ldev, \
+				  MLX5_LAG_FILTER_PORTS)
+
+int mlx5_get_pre_lag_func(struct mlx5_lag *ldev, int start_idx, int end_idx,
+			  u32 filter);
+int mlx5_get_next_lag_func(struct mlx5_lag *ldev, int start_idx, u32 filter);
 int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
 int mlx5_lag_num_devs(struct mlx5_lag *ldev);
 int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
 int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
 					bool cont_on_fail);
+int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev,
+		       u32 group_id);
+void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev);
 #endif /* __MLX5_LAG_H__ */
-- 
2.44.0


  parent reply	other threads:[~2026-05-27 12:56 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-27 12:54 [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 01/13] net/mlx5: LAG, factor out shared FDB code into dedicated file Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 02/13] net/mlx5: E-Switch, align disable sequence with switchdev-to-legacy transition Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 03/13] net/mlx5: E-Switch, move devcom init from TC to eswitch layer Tariq Toukan
2026-05-28 18:48   ` Shay Drori
2026-05-27 12:54 ` [PATCH net-next 04/13] net/mlx5: LAG, replace peer count check with direct peer lookup Tariq Toukan
2026-05-27 12:54 ` Tariq Toukan [this message]
2026-05-28 18:56   ` [PATCH net-next 05/13] net/mlx5: LAG, prepare for SD device integration Shay Drori
2026-05-27 12:54 ` [PATCH net-next 06/13] net/mlx5: LAG, extend shared FDB API with group_id filter Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 07/13] net/mlx5: SD, introduce Socket Direct LAG Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 08/13] net/mlx5: LAG, block RoCE and VF LAG for SD devices Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 09/13] net/mlx5: LAG, block multipath " Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 10/13] net/mlx5: SD, keep netdev resources on same PF in switchdev mode Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 11/13] net/mlx5e: TC, track peer flow slots with bitmap Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 12/13] net/mlx5e: TC, enable steering for SD LAG Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 13/13] net/mlx5e: Verify unique vhca_id count instead of range Tariq Toukan
2026-05-27 22:08 ` [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Jacob Keller
2026-05-28  9:18   ` Shay Drori
2026-05-28 17:59     ` Jacob Keller
2026-05-29  0:40 ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260527125427.385976-6-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=noren@nvidia.com \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    --cc=ychemla@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox