From: Tariq Toukan <tariqt@nvidia.com>
To: Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
Andrew Lunn <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>
Cc: Saeed Mahameed <saeedm@nvidia.com>,
Leon Romanovsky <leon@kernel.org>,
Tariq Toukan <tariqt@nvidia.com>, Mark Bloch <mbloch@nvidia.com>,
Shay Drory <shayd@nvidia.com>, Or Har-Toov <ohartoov@nvidia.com>,
Edward Srouji <edwards@nvidia.com>,
Simon Horman <horms@kernel.org>,
Maher Sanalla <msanalla@nvidia.com>,
Parav Pandit <parav@nvidia.com>, Kees Cook <kees@kernel.org>,
Moshe Shemesh <moshe@nvidia.com>,
Patrisious Haddad <phaddad@nvidia.com>, <netdev@vger.kernel.org>,
<linux-rdma@vger.kernel.org>, <linux-kernel@vger.kernel.org>,
Gal Pressman <gal@nvidia.com>
Subject: [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support
Date: Thu, 4 Jun 2026 14:44:52 +0300 [thread overview]
Message-ID: <20260604114455.434711-13-tariqt@nvidia.com> (raw)
In-Reply-To: <20260604114455.434711-1-tariqt@nvidia.com>
From: Shay Drory <shayd@nvidia.com>
Enable MPESW LAG creation over SD LAG members, forming a composite LAG
hierarchy. This allows bonding multiple SD groups together under a
single MPESW configuration with shared FDB.
When enabling composite MPESW, the individual SD LAG shared FDB
configurations are temporarily torn down and recreated when the
composite LAG is disabled.
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 6 ++
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 8 ++
.../ethernet/mellanox/mlx5/core/lag/mpesw.c | 95 +++++++++++++++++--
.../ethernet/mellanox/mlx5/core/lag/mpesw.h | 4 +
4 files changed, 105 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 9566fbf59fdb..25a9012e3014 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -2545,6 +2545,7 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
struct mlx5_core_dev *primary;
struct mlx5_lag *ldev;
struct lag_func *pf;
+ bool mpesw;
int i;
ldev = mlx5_lag_dev(dev);
@@ -2553,6 +2554,9 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
primary = mlx5_sd_get_primary(dev) ?: dev;
mlx5_devcom_comp_lock(primary->priv.hca_devcom_comp);
+ mpesw = ldev->mode == MLX5_LAG_MODE_MPESW;
+ if (mpesw)
+ mlx5_mpesw_sd_devcoms_lock(ldev);
mutex_lock(&ldev->lock);
ldev->mode_changes_in_progress++;
@@ -2564,6 +2568,8 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
}
mutex_unlock(&ldev->lock);
+ if (mpesw)
+ mlx5_mpesw_sd_devcoms_unlock(ldev);
mlx5_devcom_comp_unlock(primary->priv.hca_devcom_comp);
if (!sd_devcom)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 34350b0a7307..3a90d360d724 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -157,6 +157,14 @@ __mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
return pf && pf->group_id != 0;
}
+static inline bool
+__mlx5_lag_dev_is_port(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+ return pf && xa_get_mark(&ldev->pfs, pf->idx, MLX5_LAG_XA_MARK_PORT);
+}
+
static inline bool
__mlx5_lag_is_active(struct mlx5_lag *ldev)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 2cb44084e239..50bfb450c71e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -15,7 +15,7 @@ static void mlx5_mpesw_metadata_cleanup(struct mlx5_lag *ldev)
u32 pf_metadata;
int i;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
esw = dev->priv.eswitch;
pf_metadata = ldev->lag_mpesw.pf_metadata[i];
@@ -36,7 +36,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
u32 pf_metadata;
int i, err;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
esw = dev->priv.eswitch;
pf_metadata = mlx5_esw_match_metadata_alloc(esw);
@@ -52,7 +52,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
goto err_metadata;
}
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_MULTIPORT_ESW,
(void *)0);
@@ -65,6 +65,48 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
return err;
}
+static void mlx5_mpesw_restore_sd_fdb(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int err, i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, pf->group_id);
+ if (err)
+ mlx5_core_warn(pf->dev,
+ "Failed to restore SD shared FDB (%d)\n",
+ err);
+ }
+}
+
+static int mlx5_mpesw_teardown_sd_fdb(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (!pf->sd_fdb_active)
+ continue;
+ mlx5_lag_shared_fdb_destroy(ldev, pf->group_id);
+ }
+ return 0;
+}
+
+static bool mlx5_lag_has_sd_group(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (pf->group_id)
+ return true;
+ }
+ return false;
+}
+
static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
{
int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -92,10 +134,17 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
if (err)
return err;
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_teardown_sd_fdb(ldev);
+
err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW,
MLX5_LAG_FILTER_ALL);
if (err) {
- mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err);
+ mlx5_core_warn(dev0,
+ "Failed to create LAG in MPESW mode (%d)\n",
+ err);
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_restore_sd_fdb(ldev);
mlx5_mpesw_metadata_cleanup(ldev);
return err;
}
@@ -105,9 +154,36 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev)
{
- if (ldev->mode == MLX5_LAG_MODE_MPESW) {
- mlx5_mpesw_metadata_cleanup(ldev);
- mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
+ if (ldev->mode != MLX5_LAG_MODE_MPESW)
+ return;
+
+ mlx5_mpesw_metadata_cleanup(ldev);
+ mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_restore_sd_fdb(ldev);
+}
+
+void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev)
+{
+ struct mlx5_devcom_comp_dev *sd_devcom;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);
+ if (sd_devcom)
+ mlx5_devcom_comp_lock(sd_devcom);
+ }
+}
+
+void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev)
+{
+ struct mlx5_devcom_comp_dev *sd_devcom;
+ int i;
+
+ mlx5_ldev_for_each_reverse(i, MLX5_MAX_PORTS, 0, ldev) {
+ sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);
+ if (sd_devcom)
+ mlx5_devcom_comp_unlock(sd_devcom);
}
}
@@ -122,6 +198,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
return;
mlx5_devcom_comp_lock(devcom);
+ mlx5_mpesw_sd_devcoms_lock(ldev);
mutex_lock(&ldev->lock);
if (ldev->mode_changes_in_progress) {
mpesww->result = -EAGAIN;
@@ -134,6 +211,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
mlx5_lag_disable_mpesw(ldev);
unlock:
mutex_unlock(&ldev->lock);
+ mlx5_mpesw_sd_devcoms_unlock(ldev);
mlx5_devcom_comp_unlock(devcom);
complete(&mpesww->comp);
}
@@ -199,7 +277,8 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev = mlx5_lag_dev(dev);
- return ldev && ldev->mode == MLX5_LAG_MODE_MPESW;
+ return ldev && ldev->mode == MLX5_LAG_MODE_MPESW &&
+ __mlx5_lag_dev_is_port(ldev, dev);
}
EXPORT_SYMBOL(mlx5_lag_is_mpesw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
index b767dbb4f457..5099723ba0f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
@@ -33,8 +33,12 @@ void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev);
int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev);
#ifdef CONFIG_MLX5_ESWITCH
void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev);
+void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev);
+void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev);
#else
static inline void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev) {}
+static inline void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev) {}
+static inline void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev) {}
#endif /* CONFIG_MLX5_ESWITCH */
#ifdef CONFIG_MLX5_ESWITCH
--
2.44.0
next prev parent reply other threads:[~2026-06-04 11:46 UTC|newest]
Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-04 11:44 [PATCH net-next 00/15] net/mlx5: Add switchdev mode support for Socket Direct single netdev, part 2/2 Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 01/15] net/mlx5: E-Switch, skip uplink IB rep load for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 02/15] net/mlx5: devcom, expose locked variant of send_event Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 03/15] net/mlx5: devcom, add DEVCOM_CANT_FAIL for non-rollback events Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 04/15] net/mlx5: SD, make primary/secondary role determination more robust Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 05/15] net/mlx5: SD, add L2 table silent mode query support Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 06/15] net/mlx5: SD, expend vport metadata for SD secondary devices Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 08/15] net/mlx5: E-Switch, notify SD on eswitch disable Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 09/15] net/mlx5: LAG, store demux resources per master lag_func Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 10/15] net/mlx5: LAG, disable both regular and SD LAG on lag_disable_change Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 11/15] net/mlx5: LAG, introduce software vport LAG implementation Tariq Toukan
2026-06-04 11:44 ` Tariq Toukan [this message]
2026-06-04 11:44 ` [PATCH net-next 13/15] net/mlx5: E-Switch, defer rep load while SD LAG is not active Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 14/15] net/mlx5: SD, defer vport metadata init until SD is ready Tariq Toukan
2026-06-04 11:44 ` [PATCH net-next 15/15] net/mlx5: SD, enable SD over ECPF and allow switchdev transition Tariq Toukan
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260604114455.434711-13-tariqt@nvidia.com \
--to=tariqt@nvidia.com \
--cc=andrew+netdev@lunn.ch \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=edwards@nvidia.com \
--cc=gal@nvidia.com \
--cc=horms@kernel.org \
--cc=kees@kernel.org \
--cc=kuba@kernel.org \
--cc=leon@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=mbloch@nvidia.com \
--cc=moshe@nvidia.com \
--cc=msanalla@nvidia.com \
--cc=netdev@vger.kernel.org \
--cc=ohartoov@nvidia.com \
--cc=pabeni@redhat.com \
--cc=parav@nvidia.com \
--cc=phaddad@nvidia.com \
--cc=saeedm@nvidia.com \
--cc=shayd@nvidia.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox