From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
Edward Srouji <edwards@nvidia.com>,
Simon Horman <horms@kernel.org>,
Maher Sanalla <msanalla@nvidia.com>,
Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
Moshe Shemesh <moshe@nvidia.com>,
Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation
Date: Thu, 4 Jun 2026 14:44:51 +0300 [thread overview]
Message-ID: <20260604114455.434711-12-tariqt@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-1-tariqt@nvidia.com>
From: Shay Drory <shayd@nvidia.com>
SD LAG is a virtual LAG without hardware LAG support, so it cannot use
the firmware vport LAG commands. Implement a software-based vport LAG
using egress ACL bounce rules.
Add esw_set_slave_egress_rule() to create an egress ACL rule on the
slave's manager vport that bounces traffic to the master's manager
vport. This achieves the same traffic steering as hardware vport LAG.
Redirect mlx5_cmd_create_vport_lag() and mlx5_cmd_destroy_vport_lag()
to the software implementation when operating in SD LAG mode.
In addition, adjust lag_demux creation to check SD LAG mode as well.
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 4 +
.../mellanox/mlx5/core/eswitch_offloads.c | 142 ++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 49 +++++-
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 14 ++
.../mellanox/mlx5/core/lag/shared_fdb.c | 74 ++++++++-
5 files changed, 280 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 94a530d19828..a5f0774834fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -950,6 +950,10 @@ void esw_vport_change_handle_locked(struct mlx5_vport *vport);
bool mlx5_esw_offloads_controller_valid(const struct mlx5_eswitch *esw, u32 controller);
+int mlx5_eswitch_offloads_vport_lag_add_one(struct mlx5_eswitch *master_esw,
+ struct mlx5_eswitch *slave_esw);
+void mlx5_eswitch_offloads_vport_lag_del_one(struct mlx5_eswitch *master_esw,
+ struct mlx5_eswitch *slave_esw);
int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
struct mlx5_eswitch *slave_esw, int max_slaves);
void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1133267a53fb..ad812fb1bb80 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3041,6 +3041,136 @@ static int __esw_set_master_egress_rule(struct mlx5_core_dev *master,
return err;
}
+static int esw_slave_egress_create_resources(struct mlx5_eswitch *esw,
+ struct mlx5_vport *vport)
+{
+ struct mlx5_flow_table_attr ft_attr = {
+ .max_fte = 1, .prio = 0, .level = 0,
+ };
+ int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+ struct mlx5_flow_namespace *ns;
+ struct mlx5_flow_table *acl;
+ struct mlx5_flow_group *g;
+ u32 *flow_group_in;
+ int err = 0;
+
+ if (vport->egress.acl)
+ return 0;
+
+ xa_init_flags(&vport->egress.offloads.bounce_rules, XA_FLAGS_ALLOC);
+ ns = mlx5_get_flow_vport_namespace(esw->dev,
+ MLX5_FLOW_NAMESPACE_ESW_EGRESS,
+ vport->index);
+ if (!ns)
+ return -EINVAL;
+
+ flow_group_in = kvzalloc(inlen, GFP_KERNEL);
+ if (!flow_group_in)
+ return -ENOMEM;
+
+ if (vport->vport || mlx5_core_is_ecpf(esw->dev))
+ ft_attr.flags = MLX5_FLOW_TABLE_OTHER_VPORT;
+
+ acl = mlx5_create_vport_flow_table(ns, &ft_attr, vport->vport);
+ if (IS_ERR(acl)) {
+ err = PTR_ERR(acl);
+ goto out;
+ }
+
+ g = mlx5_create_flow_group(acl, flow_group_in);
+ if (IS_ERR(g)) {
+ err = PTR_ERR(g);
+ goto err_table;
+ }
+
+ vport->egress.acl = acl;
+ vport->egress.offloads.bounce_grp = g;
+ vport->egress.type = VPORT_EGRESS_ACL_TYPE_SHARED_FDB;
+ err = 0;
+
+err_table:
+ if (err && !IS_ERR_OR_NULL(acl)) {
+ mlx5_destroy_flow_table(acl);
+ vport->egress.acl = NULL;
+ }
+out:
+ kvfree(flow_group_in);
+ return err;
+}
+
+static void esw_slave_egress_destroy_resources(struct mlx5_vport *vport)
+{
+ if (!IS_ERR_OR_NULL(vport->egress.offloads.bounce_grp)) {
+ mlx5_destroy_flow_group(vport->egress.offloads.bounce_grp);
+ vport->egress.offloads.bounce_grp = NULL;
+ }
+ if (!IS_ERR_OR_NULL(vport->egress.acl)) {
+ esw_acl_egress_ofld_cleanup(vport);
+ xa_destroy(&vport->egress.offloads.bounce_rules);
+ }
+}
+
+static int esw_set_slave_egress_rule(struct mlx5_core_dev *master,
+ struct mlx5_core_dev *slave)
+{
+ struct mlx5_eswitch *slave_esw = slave->priv.eswitch;
+ u16 master_vhca = MLX5_CAP_GEN(master, vhca_id);
+ struct mlx5_flow_destination dest = {};
+ struct mlx5_flow_handle *bounce_rule;
+ struct mlx5_flow_act flow_act = {};
+ struct mlx5_vport *slave_vport;
+ int err;
+
+ slave_vport = mlx5_eswitch_get_vport(slave_esw,
+ slave_esw->manager_vport);
+ if (IS_ERR(slave_vport))
+ return PTR_ERR(slave_vport);
+
+ err = esw_slave_egress_create_resources(slave_esw, slave_vport);
+ if (err)
+ return err;
+
+ flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+ dest.vport.num = master->priv.eswitch->manager_vport;
+ dest.vport.vhca_id = master_vhca;
+ dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
+
+ bounce_rule = mlx5_add_flow_rules(slave_vport->egress.acl, NULL,
+ &flow_act, &dest, 1);
+ if (IS_ERR(bounce_rule)) {
+ err = PTR_ERR(bounce_rule);
+ goto err_rule;
+ }
+ err = xa_insert(&slave_vport->egress.offloads.bounce_rules,
+ master_vhca, bounce_rule, GFP_KERNEL);
+ if (err)
+ goto err_insert;
+
+ return 0;
+err_insert:
+ mlx5_del_flow_rules(bounce_rule);
+err_rule:
+ esw_slave_egress_destroy_resources(slave_vport);
+ return err;
+}
+
+static void esw_unset_slave_egress_rule(struct mlx5_core_dev *master,
+ struct mlx5_core_dev *slave)
+{
+ struct mlx5_eswitch *slave_esw = slave->priv.eswitch;
+ u16 master_vhca = MLX5_CAP_GEN(master, vhca_id);
+ struct mlx5_vport *slave_vport;
+
+ slave_vport = mlx5_eswitch_get_vport(slave_esw,
+ slave_esw->manager_vport);
+ if (IS_ERR(slave_vport))
+ return;
+
+ esw_acl_egress_ofld_bounce_rule_destroy(slave_vport, master_vhca);
+ esw_slave_egress_destroy_resources(slave_vport);
+}
+
static int esw_master_egress_create_resources(struct mlx5_eswitch *esw,
struct mlx5_flow_namespace *egress_ns,
struct mlx5_vport *vport, size_t count)
@@ -3208,6 +3338,18 @@ void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
esw_unset_master_egress_rule(master_esw->dev, slave_esw->dev);
}
+int mlx5_eswitch_offloads_vport_lag_add_one(struct mlx5_eswitch *master_esw,
+ struct mlx5_eswitch *slave_esw)
+{
+ return esw_set_slave_egress_rule(master_esw->dev, slave_esw->dev);
+}
+
+void mlx5_eswitch_offloads_vport_lag_del_one(struct mlx5_eswitch *master_esw,
+ struct mlx5_eswitch *slave_esw)
+{
+ esw_unset_slave_egress_rule(master_esw->dev, slave_esw->dev);
+}
+
#define ESW_OFFLOADS_DEVCOM_PAIR (0)
#define ESW_OFFLOADS_DEVCOM_UNPAIR (1)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index b660253ffc6d..9566fbf59fdb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -139,9 +139,44 @@ static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, struct mlx5_lag *ldev,
return mlx5_cmd_exec_in(dev, modify_lag, in);
}
+static u32 mlx5_lag_dev_group_id(struct mlx5_core_dev *dev)
+{
+ struct mlx5_lag *ldev = mlx5_lag_dev(dev);
+ struct lag_func *pf;
+ int i;
+
+ if (!ldev)
+ return 0;
+
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (pf->dev == dev)
+ return pf->sd_fdb_active ? pf->group_id : 0;
+ }
+ return 0;
+}
+
+static int mlx5_lag_is_sw_lag(struct mlx5_core_dev *dev)
+{
+ return mlx5_lag_is_sd(dev);
+}
+
int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};
+ struct mlx5_lag *ldev = mlx5_lag_dev(dev);
+ int ret;
+
+ if (mlx5_lag_is_sw_lag(dev)) {
+ if (!ldev)
+ return -ENODEV;
+
+ mutex_lock(&ldev->lock);
+ ret = mlx5_lag_create_vport_lag(mlx5_lag_dev(dev),
+ mlx5_lag_dev_group_id(dev));
+ mutex_unlock(&ldev->lock);
+ return ret;
+ }
MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);
@@ -152,6 +187,18 @@ EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);
int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};
+ struct mlx5_lag *ldev = mlx5_lag_dev(dev);
+
+ if (mlx5_lag_is_sw_lag(dev)) {
+ if (!ldev)
+ return 0;
+
+ mutex_lock(&ldev->lock);
+ mlx5_lag_destroy_vport_lag(mlx5_lag_dev(dev),
+ mlx5_lag_dev_group_id(dev));
+ mutex_unlock(&ldev->lock);
+ return 0;
+ }
MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);
@@ -1663,7 +1710,7 @@ int mlx5_lag_demux_init(struct mlx5_core_dev *dev,
xa_init(&pf->lag_demux_rules);
- if (mlx5_get_sd(dev))
+ if (mlx5_lag_is_sw_lag(dev))
return mlx5_lag_demux_ft_fg_init(dev, ft_attr, pf);
return mlx5_lag_demux_fw_init(dev, ft_attr, pf);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index c689f1951cd8..34350b0a7307 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -175,6 +175,8 @@ int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
enum mlx5_lag_mode mode,
u32 group_id);
void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id);
+int mlx5_lag_create_vport_lag(struct mlx5_lag *ldev, u32 group_id);
+int mlx5_lag_destroy_vport_lag(struct mlx5_lag *ldev, u32 group_id);
int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev);
void mlx5_lag_destroy_single_fdb(struct mlx5_lag *ldev);
bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev);
@@ -191,6 +193,18 @@ static inline int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
static inline void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev,
u32 group_id) {}
+static inline int mlx5_lag_create_vport_lag(struct mlx5_lag *ldev,
+ u32 group_id)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int mlx5_lag_destroy_vport_lag(struct mlx5_lag *ldev,
+ u32 group_id)
+{
+ return -EOPNOTSUPP;
+}
+
static inline int mlx5_lag_create_single_fdb(struct mlx5_lag *ldev)
{
return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
index 1371e14c4c13..8d4f2903a101 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
@@ -89,6 +89,76 @@ static int mlx5_lag_create_single_fdb_filter(struct mlx5_lag *ldev, u32 filter)
return err;
}
+int mlx5_lag_create_vport_lag(struct mlx5_lag *ldev, u32 group_id)
+{
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_ALL;
+ int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_eswitch *master_esw;
+ struct mlx5_core_dev *dev0;
+ int i, j;
+ int err;
+
+ if (master_idx < 0)
+ return -EINVAL;
+
+ dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
+ master_esw = dev0->priv.eswitch;
+
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ struct mlx5_eswitch *slave_esw;
+
+ if (i == master_idx)
+ continue;
+
+ slave_esw = mlx5_lag_pf(ldev, i)->dev->priv.eswitch;
+ err = mlx5_eswitch_offloads_vport_lag_add_one(master_esw,
+ slave_esw);
+ if (err)
+ goto err;
+ }
+
+ return 0;
+
+err:
+ mlx5_lag_for_each_reverse(j, i - 1, 0, ldev, filter) {
+ struct mlx5_eswitch *slave_esw;
+
+ if (j == master_idx)
+ continue;
+ slave_esw = mlx5_lag_pf(ldev, j)->dev->priv.eswitch;
+ mlx5_eswitch_offloads_vport_lag_del_one(master_esw, slave_esw);
+ }
+ return err;
+}
+
+int mlx5_lag_destroy_vport_lag(struct mlx5_lag *ldev, u32 group_id)
+{
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_ALL;
+ int master_idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
+ filter);
+ struct mlx5_eswitch *master_esw;
+ struct mlx5_core_dev *dev0;
+ int i;
+
+ if (master_idx < 0)
+ return 0;
+
+ dev0 = mlx5_lag_pf(ldev, master_idx)->dev;
+ master_esw = dev0->priv.eswitch;
+
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ struct mlx5_core_dev *dev;
+
+ if (i == master_idx)
+ continue;
+ dev = mlx5_lag_pf(ldev, i)->dev;
+ mlx5_eswitch_offloads_vport_lag_del_one(master_esw,
+ dev->priv.eswitch);
+ }
+ return 0;
+}
+
static void mlx5_lag_destroy_single_fdb_filter(struct mlx5_lag *ldev,
u32 filter)
{
@@ -141,7 +211,7 @@ int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
enum mlx5_lag_mode mode,
u32 group_id)
{
- u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_ALL;
int idx = mlx5_lag_get_dev_index_by_seq_filter(ldev, MLX5_LAG_P1,
filter);
struct mlx5_core_dev *dev0;
@@ -209,7 +279,7 @@ int mlx5_lag_shared_fdb_create(struct mlx5_lag *ldev,
void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id)
{
- u32 filter = group_id ? group_id : MLX5_LAG_FILTER_PORTS;
+ u32 filter = group_id ? group_id : MLX5_LAG_FILTER_ALL;
struct lag_func *pf;
int err;
int i;
--
2.44.0
next prev parent reply other threads:[~2026-06-04 11:46 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-04 11:44 ` Tariq Toukan [this message]
2026-06-04 11:44 ` [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260604114455.434711-12-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=edwards@nvidia.com \
--cc=gal@nvidia.com \
--cc=horms@kernel.org \
--cc=kees@kernel.org \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=msanalla@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=ohartoov@nvidia.com \
--cc=pabeni@redhat.com \
--cc=parav@nvidia.com \
--cc=phaddad@nvidia.com \
--cc=saeedm@nvidia.com \
--cc=shayd@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox