From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
Nimrod Oren <noren@nvidia.com>, Yael Chemla <ychemla@nvidia.com>,
Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
Edward Srouji <edwards@nvidia.com>,
Maher Sanalla <msanalla@nvidia.com>,
Simon Horman <horms@kernel.org>, Parav Pandit <parav@nvidia.com>,
Patrisious Haddad <phaddad@nvidia.com>,
Kees Cook <kees@kernel.org>, Moshe Shemesh <moshe@nvidia.com>,
<linux-kernel@vger.kernel.org>, <netdev@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 12/13] net/mlx5e: TC, enable steering for SD LAG
Date: Wed, 27 May 2026 15:54:26 +0300 [thread overview]
Message-ID: <20260527125427.385976-13-tariqt@nvidia.com> (raw)
In-Reply-To: <20260527125427.385976-1-tariqt@nvidia.com>
From: Shay Drory <shayd@nvidia.com>
Enable TC flow steering for SD LAG mode by extending multiport
eligibility checks and peer flow handling.
SD LAG operates similarly to MPESW for TC offloads - flows on
secondary devices need peer flow creation on the primary, and
multiport forwarding rules are eligible when either MPESW or SD LAG
is active.
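Add mlx5_lag_is_sd() helper to query SD LAG mode, and
mlx5_sd_is_primary() to identify the primary device. Make
mlx5_sd_get_primary() non-static; it now returns NULL while the SD
devcom component is not ready. Redirect uplink priv/proto_dev queries
to the primary device's eswitch in SD configurations.
Record the peer index on each peer flow at add time, so that peer flow
removal does not depend on the LAG state at delete time.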
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../ethernet/mellanox/mlx5/core/en/tc_priv.h | 4 ++
.../net/ethernet/mellanox/mlx5/core/en_tc.c | 53 +++++++++++++++++--
.../mellanox/mlx5/core/eswitch_offloads.c | 8 +++
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 14 +++++
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 +
.../net/ethernet/mellanox/mlx5/core/lib/sd.c | 15 +++++-
.../net/ethernet/mellanox/mlx5/core/lib/sd.h | 2 +
7 files changed, 92 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
index a0434ceebe69..28cab4bf525c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_priv.h
@@ -104,6 +104,10 @@ struct mlx5e_tc_flow {
* due to missing route)
*/
struct list_head peer_flows; /* flows on peer */
+ int peer_index; /* peer-flow index pinned at add time, used at del
+ * time so removal is independent of LAG state
+ * changes between add and del.
+ */
struct net_device *orig_dev; /* netdev adding flow first */
int tmp_entry_index;
struct list_head tmp_list; /* temporary flow list used by neigh update */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 2a16368a948e..910492eb51f2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -71,6 +71,7 @@
#include <asm/div64.h>
#include "lag/lag.h"
#include "lag/mp.h"
+#include "lib/sd.h"
#define MLX5E_TC_TABLE_NUM_GROUPS 4
#define MLX5E_TC_TABLE_MAX_GROUP_SIZE BIT(18)
@@ -2132,7 +2133,7 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
mutex_unlock(&esw->offloads.peer_mutex);
list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
- if (peer_index != mlx5_lag_get_dev_seq(peer_flow->priv->mdev))
+ if (peer_index != peer_flow->peer_index)
continue;
list_del(&peer_flow->peer_flows);
@@ -4196,9 +4197,26 @@ static bool is_lag_dev(struct mlx5e_priv *priv,
same_hw_reps(priv, peer_netdev));
}
+/* Check whether @peer_netdev is a valid forward destination for @priv under
+ * SD LAG: same_hw_reps() must pass, @priv's device must be in SD LAG mode,
+ * and both devices must resolve to the same SD primary.
+ *
+ * mlx5_sd_get_primary() returns NULL while the SD devcom component is not
+ * ready. An unresolved primary never matches, so two not-ready devices are
+ * not mistaken for members of the same SD group (NULL == NULL).
+ */
+static bool is_sd_eligible(struct mlx5e_priv *priv,
+			   struct net_device *peer_netdev)
+{
+	struct mlx5_core_dev *primary;
+	struct mlx5e_priv *peer_priv;
+
+	/* Keep same_hw_reps() first: peer_priv->mdev is not read unless it
+	 * passes.
+	 */
+	if (!same_hw_reps(priv, peer_netdev))
+		return false;
+
+	if (!mlx5_lag_is_sd(priv->mdev))
+		return false;
+
+	primary = mlx5_sd_get_primary(priv->mdev);
+	if (!primary)
+		return false;
+
+	peer_priv = netdev_priv(peer_netdev);
+	return primary == mlx5_sd_get_primary(peer_priv->mdev);
+}
+
+/* Multiport forwarding to @out_dev is eligible when the device returned by
+ * mlx5_sd_get_primary() for @priv is in MPESW LAG mode and same_hw_reps()
+ * passes.
+ */
static bool is_multiport_eligible(struct mlx5e_priv *priv, struct net_device *out_dev)
{
- return same_hw_reps(priv, out_dev) && mlx5_lag_is_mpesw(priv->mdev);
+	struct mlx5_core_dev *lag_dev;
+
+	/* NULL means the SD devcom component is not ready yet */
+	lag_dev = mlx5_sd_get_primary(priv->mdev);
+	if (!lag_dev)
+		return false;
+
+	if (!same_hw_reps(priv, out_dev))
+		return false;
+
+	return mlx5_lag_is_mpesw(lag_dev);
}
bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
@@ -4207,6 +4225,9 @@ bool mlx5e_is_valid_eswitch_fwd_dev(struct mlx5e_priv *priv,
if (is_merged_eswitch_vfs(priv, out_dev))
return true;
+ if (is_sd_eligible(priv, out_dev))
+ return true;
+
if (is_multiport_eligible(priv, out_dev))
return true;
@@ -4351,7 +4372,7 @@ static struct rhashtable *get_tc_ht(struct mlx5e_priv *priv,
return &tc->ht;
}
-static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
+static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow, bool *is_sd)
{
struct mlx5_esw_flow_attr *esw_attr = flow->attr->esw_attr;
struct mlx5_flow_attr *attr = flow->attr;
@@ -4372,6 +4393,13 @@ static bool is_peer_flow_needed(struct mlx5e_tc_flow *flow)
if (mlx5_lag_is_mpesw(esw_attr->in_mdev))
return true;
+ if (mlx5_lag_is_sd(esw_attr->in_mdev) &&
+ !mlx5_sd_is_primary(esw_attr->in_mdev)) {
+ if (!mlx5_lag_is_mpesw(mlx5_sd_get_primary(esw_attr->in_mdev)))
+ *is_sd = true;
+ return true;
+ }
+
return false;
}
@@ -4609,6 +4637,7 @@ static int mlx5e_tc_add_fdb_peer_flow(struct flow_cls_offload *f,
goto out;
}
+ peer_flow->peer_index = i;
list_add_tail(&peer_flow->peer_flows, &flow->peer_flows);
flow_flag_set(flow, DUP);
mutex_lock(&esw->offloads.peer_mutex);
@@ -4628,19 +4657,26 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
struct mlx5e_tc_flow **__flow)
{
struct mlx5_devcom_comp_dev *devcom = priv->mdev->priv.eswitch->devcom, *pos;
+ struct netlink_ext_ack *extack = f->common.extack;
struct mlx5e_rep_priv *rpriv = priv->ppriv;
struct mlx5_eswitch_rep *in_rep = rpriv->rep;
struct mlx5_core_dev *in_mdev = priv->mdev;
struct mlx5_eswitch *peer_esw;
struct mlx5e_tc_flow *flow;
+ bool is_sd = false;
int err;
+ if (mlx5_lag_is_sd(in_mdev) && !mlx5_lag_is_active(in_mdev)) {
+ NL_SET_ERR_MSG_MOD(extack, "SD shared FDB not yet active");
+ return -EOPNOTSUPP;
+ }
+
flow = __mlx5e_add_fdb_flow(priv, f, flow_flags, filter_dev, in_rep,
in_mdev);
if (IS_ERR(flow))
return PTR_ERR(flow);
- if (!is_peer_flow_needed(flow)) {
+ if (!is_peer_flow_needed(flow, &is_sd)) {
*__flow = flow;
return 0;
}
@@ -4651,6 +4687,15 @@ mlx5e_add_fdb_flow(struct mlx5e_priv *priv,
}
mlx5_devcom_for_each_peer_entry(devcom, peer_esw, pos) {
+ if (is_sd) {
+ /* SD shared FDB: only the matching SD primary. */
+ if (mlx5_sd_get_primary(in_mdev) !=
+ mlx5_sd_get_primary(peer_esw->dev))
+ continue;
+ } else {
+ if (!mlx5_sd_is_primary(peer_esw->dev))
+ continue;
+ }
err = mlx5e_tc_add_fdb_peer_flow(f, flow, flow_flags, peer_esw);
if (err)
goto peer_clean;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index d65f30bb2f80..830fc910a080 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4690,8 +4690,11 @@ EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps_nested);
void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
{
+	struct mlx5_core_dev *sd_primary;
struct mlx5_eswitch_rep *rep;
+	/* Look the uplink rep up on the SD primary's eswitch when a primary
+	 * is reported; a NULL primary leaves @esw unchanged.
+	 */
+	sd_primary = mlx5_sd_get_primary(esw->dev);
+	if (sd_primary)
+		esw = sd_primary->priv.eswitch;
rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
return rep->rep_data[rep_type].priv;
}
@@ -4713,6 +4716,11 @@ EXPORT_SYMBOL(mlx5_eswitch_get_proto_dev);
void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type)
{
+	struct mlx5_core_dev *sd_primary;
+
+	/* Query the uplink proto dev on the SD primary's eswitch when a
+	 * primary is reported; a NULL primary leaves @esw unchanged.
+	 */
+	sd_primary = mlx5_sd_get_primary(esw->dev);
+	if (sd_primary)
+		esw = sd_primary->priv.eswitch;
+
return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type);
}
EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index a2c7e2927431..dd3f18f85466 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -2425,6 +2425,20 @@ bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
}
EXPORT_SYMBOL(mlx5_lag_is_sriov);
+/* Report whether @dev is in SD LAG mode. The LAG device lookup and the mode
+ * check are both done under lag_lock; a device without a LAG device is not
+ * in SD mode.
+ */
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev)
+{
+	unsigned long irq_flags;
+	struct mlx5_lag *lag;
+	bool is_sd = false;
+
+	spin_lock_irqsave(&lag_lock, irq_flags);
+	lag = mlx5_lag_dev(dev);
+	if (lag)
+		is_sd = __mlx5_lag_is_sd(lag, dev);
+	spin_unlock_irqrestore(&lag_lock, irq_flags);
+
+	return is_sd;
+}
+
bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index cbe201529661..e412bb85027c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -202,6 +202,7 @@ static inline bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
}
#endif
bool mlx5_lag_check_prereq(struct mlx5_lag *ldev);
+bool mlx5_lag_is_sd(struct mlx5_core_dev *dev);
int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
struct mlx5_flow_table_attr *ft_attr);
void mlx5_lag_demux_cleanup(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index ec606851feb8..25286ecd724e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -49,13 +49,16 @@ static int mlx5_sd_get_host_buses(struct mlx5_core_dev *dev)
return sd->host_buses;
}
-static struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev)
{
struct mlx5_sd *sd = mlx5_get_sd(dev);
if (!sd)
return dev;
+ if (!mlx5_devcom_comp_is_ready(sd->devcom)) /* SD devcom component not ready yet */
+ return NULL; /* no primary to report; callers must handle NULL */
+
return sd->primary ? dev : sd->primary_dev;
}
@@ -69,6 +72,16 @@ struct mlx5_devcom_comp_dev *mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
return sd->devcom;
}
+/* Report whether @dev is the SD primary. A device for which mlx5_get_sd()
+ * returns NULL is reported as primary.
+ */
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev)
+{
+	struct mlx5_sd *sd_ctx = mlx5_get_sd(dev);
+
+	return !sd_ctx || sd_ctx->primary;
+}
+
struct mlx5_core_dev *
mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index bf59903ab23f..011702ff6f02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -10,6 +10,8 @@
struct mlx5_sd;
+struct mlx5_core_dev *mlx5_sd_get_primary(struct mlx5_core_dev *dev);
+bool mlx5_sd_is_primary(struct mlx5_core_dev *dev);
struct mlx5_core_dev *mlx5_sd_primary_get_peer(struct mlx5_core_dev *primary, int idx);
int mlx5_sd_ch_ix_get_dev_ix(struct mlx5_core_dev *dev, int ch_ix);
int mlx5_sd_ch_ix_get_vec_ix(struct mlx5_core_dev *dev, int ch_ix);
--
2.44.0
next prev parent reply other threads:[~2026-05-27 12:56 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-27 12:54 [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 01/13] net/mlx5: LAG, factor out shared FDB code into dedicated file Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 02/13] net/mlx5: E-Switch, align disable sequence with switchdev-to-legacy transition Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 03/13] net/mlx5: E-Switch, move devcom init from TC to eswitch layer Tariq Toukan
2026-05-28 18:48 ` Shay Drori
2026-05-27 12:54 ` [PATCH net-next 04/13] net/mlx5: LAG, replace peer count check with direct peer lookup Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 05/13] net/mlx5: LAG, prepare for SD device integration Tariq Toukan
2026-05-28 18:56 ` Shay Drori
2026-05-27 12:54 ` [PATCH net-next 06/13] net/mlx5: LAG, extend shared FDB API with group_id filter Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 07/13] net/mlx5: SD, introduce Socket Direct LAG Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 08/13] net/mlx5: LAG, block RoCE and VF LAG for SD devices Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 09/13] net/mlx5: LAG, block multipath " Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 10/13] net/mlx5: SD, keep netdev resources on same PF in switchdev mode Tariq Toukan
2026-05-27 12:54 ` [PATCH net-next 11/13] net/mlx5e: TC, track peer flow slots with bitmap Tariq Toukan
2026-05-27 12:54 ` Tariq Toukan [this message]
2026-05-27 12:54 ` [PATCH net-next 13/13] net/mlx5e: Verify unique vhca_id count instead of range Tariq Toukan
2026-05-27 22:08 ` [PATCH net-next 00/13] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 1/2 Jacob Keller
2026-05-28 9:18 ` Shay Drori
2026-05-28 17:59 ` Jacob Keller
2026-05-29 0:40 ` Jakub Kicinski
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260527125427.385976-13-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=edwards@nvidia.com \
--cc=gal@nvidia.com \
--cc=horms@kernel.org \
--cc=kees@kernel.org \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=msanalla@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=noren@nvidia.com \
--cc=ohartoov@nvidia.com \
--cc=pabeni@redhat.com \
--cc=parav@nvidia.com \
--cc=phaddad@nvidia.com \
--cc=saeedm@nvidia.com \
--cc=shayd@nvidia.com \
--cc=ychemla@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox