Netdev List
 help / color / mirror / Atom feed
From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Andrew Lunn <andrew+netdev@lunn.ch>,
	"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
	Leon Romanovsky <leon@kernel.org>,
	Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
	Nimrod Oren <noren@nvidia.com>, Yael Chemla <ychemla@nvidia.com>,
	Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
	Edward Srouji <edwards@nvidia.com>,
	Maher Sanalla <msanalla@nvidia.com>,
	Simon Horman <horms@kernel.org>, Parav Pandit <parav@nvidia.com>,
	Patrisious Haddad <phaddad@nvidia.com>,
	Kees Cook <kees@kernel.org>, Moshe Shemesh <moshe@nvidia.com>,
	<linux-kernel@vger.kernel.org>, <netdev@vger.kernel.org>,
	<linux-rdma@vger.kernel.org>, Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 12/13] net/mlx5e: TC, enable steering for SD LAG
Date: Wed, 27 May 2026 15:54:26 +0300	[thread overview]
Message-ID: <20260527125427.385976-13-tariqt@nvidia.com> (raw)
In-Reply-To: <20260527125427.385976-1-tariqt@nvidia.com>

From: Shay Drory <shayd@nvidia.com>

Enable TC flow steering for SD LAG mode by extending multiport
eligibility checks and peer flow handling.

SD LAG operates similarly to MPESW for TC offloads - flows on
secondary devices need peer flow creation on the primary, and
multiport forwarding rules are eligible when either MPESW or SD LAG
is active.

Add mlx5_lag_is_sd() helper to query SD LAG mode, and
mlx5_sd_is_primary() to identify the primary device. Redirect uplink
priv/proto_dev queries to the primary device's eswitch in SD
configurations.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/en/tc_priv.h  |  4 ++
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 53 +++++++++++++++++--
 .../mellanox/mlx5/core/eswitch_offloads.c     |  8 +++
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 14 +++++
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h |  1 +
 .../net/ethernet/mellanox/mlx5/core/lib/sd.c  | 15 +++++-
 .../net/ethernet/mellanox/mlx5/core/lib/sd.h  |  2 +
 7 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
index a0434ceebe69..28cab4bf525c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
@@ -104,6 +104,10 @@ struct mlx5e_tc_flow {
 				   * due to missing route)
 				   */
 	struct list_head peer_flows; /* flows on peer */
+	int peer_index; /* peer-flow index pinned at add time, used at del
+			 * time so removal is independent of LAG state
+			 * changes between add and del.
+			 */
 	struct net_device *orig_dev; /* netdev adding flow first */
 	int tmp_entry_index;
 	struct list_head tmp_list; /* temporary flow list used by neigh update */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 2a16368a948e..910492eb51f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -71,6 +71,7 @@
 #include <asm/div64.h>
 #include "lag/lag.h"
 #include "lag/mp.h"
+#include "lib/sd.h"
 
 #define MLX5E_TC_TABLE_NUM_GROUPS 4
 #define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18)
@@ -2132,7 +2133,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
 	mutex_unlock(&esw->offloads.peer_mutex);
 
 	list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
-		if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev))
+		if (peer_index != peer_flow->peer_index)
 			continue;
 
 		list_del(&peer_flow->peer_flows);
@@ -4196,9 +4197,26 @@ static bool is_lag_dev(struct mlx5e_priv *priv,
 		 same_hw_reps(priv, peer_netdev));
 }
 
+static bool is_sd_eligible(struct mlx5e_priv *priv,
+			   struct net_device *peer_netdev)
+{
+	struct mlx5e_priv *peer_priv;
+
+	peer_priv = netdev_priv(peer_netdev);
+	return same_hw_reps(priv, peer_netdev) &&
+		mlx5_lag_is_sd(priv->mdev) &&
+		(mlx5_sd_get_primary(priv->mdev) ==
+		 mlx5_sd_get_primary(peer_priv->mdev));
+}
+
 static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev)
 {
-	return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(priv->mdev);
+	struct mlx5_core_dev *primary = mlx5_sd_get_primary(priv->mdev);
+
+	if (!primary)
+		return false;
+
+	return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(primary);
 }
 
 bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
@@ -4207,6 +4225,9 @@ bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
 	if (is_merged_eswitch_vfs(priv, out_dev))
 		return true;
 
+	if (is_sd_eligible(priv, out_dev))
+		return true;
+
 	if (is_multiport_eligible(priv, out_dev))
 		return true;
 
@@ -4351,7 +4372,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv,
 		return &tc->ht;
 }
 
-static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
+static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow, bool *is_sd)
 {
 	struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
 	struct mlx5_flow_attr *attr = flow->attr;
@@ -4372,6 +4393,13 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
 	if (mlx5_lag_is_mpesw(esw_attr->in_mdev))
 		return true;
 
+	if (mlx5_lag_is_sd(esw_attr->in_mdev) &&
+	    !mlx5_sd_is_primary(esw_attr->in_mdev)) {
+		if (!mlx5_lag_is_mpesw(mlx5_sd_get_primary(esw_attr->in_mdev)))
+			*is_sd = true;
+		return true;
+	}
+
 	return false;
 }
 
@@ -4609,6 +4637,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f,
 		goto out;
 	}
 
+	peer_flow->peer_index = i;
 	list_add_tail(&peer_flow->peer_flows, &flow->peer_flows);
 	flow_flag_set(flow, DUP);
 	mutex_lock(&esw->offloads.peer_mutex);
@@ -4628,19 +4657,26 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 		   struct mlx5e_tc_flow **__flow)
 {
 	struct mlx5_devcom_comp_dev *devcom = priv->mdev->priv.eswitch->devcom, *pos;
+	struct netlink_ext_ack *extack = f->common.extack;
 	struct mlx5e_rep_priv *rpriv = priv->ppriv;
 	struct mlx5_eswitch_rep *in_rep = rpriv->rep;
 	struct mlx5_core_dev *in_mdev = priv->mdev;
 	struct mlx5_eswitch *peer_esw;
 	struct mlx5e_tc_flow *flow;
+	bool is_sd = false;
 	int err;
 
+	if (mlx5_lag_is_sd(in_mdev) && !mlx5_lag_is_active(in_mdev)) {
+		NL_SET_ERR_MSG_MOD(extack, "SD shared FDB not yet active");
+		return -EOPNOTSUPP;
+	}
+
 	flow = __mlx5e_add_fdb_flow(priv, f, flow_flags, filter_dev, in_rep,
 				    in_mdev);
 	if (IS_ERR(flow))
 		return PTR_ERR(flow);
 
-	if (!is_peer_flow_needed(flow)) {
+	if (!is_peer_flow_needed(flow, &is_sd)) {
 		*__flow = flow;
 		return 0;
 	}
@@ -4651,6 +4687,15 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
 	}
 
 	mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) {
+		if (is_sd) {
+			/* SD shared FDB: only the matching SD primary. */
+			if (mlx5_sd_get_primary(in_mdev) !=
+			    mlx5_sd_get_primary(peer_esw->dev))
+				continue;
+		} else {
+			if (!mlx5_sd_is_primary(peer_esw->dev))
+				continue;
+		}
 		err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags, peer_esw);
 		if (err)
 			goto peer_clean;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index d65f30bb2f80..830fc910a080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4690,8 +4690,11 @@ EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps_nested);
 
 void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
 {
+	struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev);
 	struct mlx5_eswitch_rep *rep;
 
+	if (primary)
+		esw = primary->priv.eswitch;
 	rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
 	return rep->rep_data[rep_type].priv;
 }
@@ -4713,6 +4716,11 @@ EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
 
 void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type)
 {
+	struct mlx5_core_dev *primary = mlx5_sd_get_primary(esw->dev);
+
+	if (primary)
+		esw = primary->priv.eswitch;
+
 	return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type);
 }
 EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index a2c7e2927431..dd3f18f85466 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -2425,6 +2425,20 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
 }
 EXPORT_SYMBOL(mlx5_lag_is_sriov);
 
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev)
+{
+	struct mlx5_lag *ldev;
+	unsigned long flags;
+	bool res;
+
+	spin_lock_irqsave(&lag_lock, flags);
+	ldev = mlx5_lag_dev(dev);
+	res  = ldev && __mlx5_lag_is_sd(ldev, dev);
+	spin_unlock_irqrestore(&lag_lock, flags);
+
+	return res;
+}
+
 bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
 {
 	struct mlx5_lag *ldev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index cbe201529661..e412bb85027c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -202,6 +202,7 @@ static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
 }
 #endif
 bool mlx5_lag_check_prereq(struct mlx5_lag *ldev);
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev);
 int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
 			struct mlx5_flow_table_attr *ft_attr);
 void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index ec606851feb8..25286ecd724e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -49,13 +49,16 @@ static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev)
 	return sd->host_buses;
 }
 
-static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
 {
 	struct mlx5_sd *sd = mlx5_get_sd(dev);
 
 	if (!sd)
 		return dev;
 
+	if (!mlx5_devcom_comp_is_ready(sd->devcom))
+		return NULL;
+
 	return sd->primary ? dev : sd->primary_dev;
 }
 
@@ -69,6 +72,16 @@ struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
 	return sd->devcom;
 }
 
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sd *sd = mlx5_get_sd(dev);
+
+	if (!sd)
+		return true;
+
+	return sd->primary;
+}
+
 struct mlx5_core_dev *
 mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index bf59903ab23f..011702ff6f02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -10,6 +10,8 @@
 
 struct mlx5_sd;
 
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev);
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev);
 struct mlx5_core_dev *mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx);
 int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix);
 int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix);
-- 
2.44.0


  parent reply	other threads:[~2026-05-27 12:56 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-27 12:54 [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 01/13] net/mlx5: LAG, factor out shared FDB code into dedicated file Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 02/13] net/mlx5: E-Switch, align disable sequence with switchdev-to-legacy transition Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 03/13] net/mlx5: E-Switch, move devcom init from TC to eswitch layer Tariq Toukan
2026-05-28 18:48   ` Shay Drori
2026-05-27 12:54 ` [PATCH net-next 04/13] net/mlx5: LAG, replace peer count check with direct peer lookup Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 05/13] net/mlx5: LAG, prepare for SD device integration Tariq Toukan
2026-05-28 18:56   ` Shay Drori
2026-05-27 12:54 ` [PATCH net-next 06/13] net/mlx5: LAG, extend shared FDB API with group_id filter Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 07/13] net/mlx5: SD, introduce Socket Direct LAG Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 08/13] net/mlx5: LAG, block RoCE and VF LAG for SD devices Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 09/13] net/mlx5: LAG, block multipath " Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 10/13] net/mlx5: SD, keep netdev resources on same PF in switchdev mode Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 11/13] net/mlx5e: TC, track peer flow slots with bitmap Tariq Toukan
2026-05-27 12:54 ` Tariq Toukan [this message]
2026-05-27 12:54 ` [PATCH net-next 13/13] net/mlx5e: Verify unique vhca_id count instead of range Tariq Toukan
2026-05-27 22:08 ` [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Jacob Keller
2026-05-28  9:18   ` Shay Drori
2026-05-28 17:59     ` Jacob Keller
2026-05-29  0:40 ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260527125427.385976-13-tariqt@nvidia.com \
    --to=tariqt@nvidia.com \
    --cc=andrew+netdev@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=edwards@nvidia.com \
    --cc=gal@nvidia.com \
    --cc=horms@kernel.org \
    --cc=kees@kernel.org \
    --cc=kuba@kernel.org \
    --cc=leon@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=mbloch@nvidia.com \
    --cc=moshe@nvidia.com \
    --cc=msanalla@nvidia.com \
    --cc=netdev@vger.kernel.org \
    --cc=noren@nvidia.com \
    --cc=ohartoov@nvidia.com \
    --cc=pabeni@redhat.com \
    --cc=parav@nvidia.com \
    --cc=phaddad@nvidia.com \
    --cc=saeedm@nvidia.com \
    --cc=shayd@nvidia.com \
    --cc=ychemla@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox